aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--llvm/lib/Target/AMDGPU/SMInstructions.td116
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll244
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll234
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir12
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir72
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll405
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll203
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll180
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll128
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll66
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll164
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll278
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll527
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll292
-rw-r--r--llvm/test/CodeGen/AMDGPU/add.ll545
-rw-r--r--llvm/test/CodeGen/AMDGPU/add.v2i16.ll268
-rw-r--r--llvm/test/CodeGen/AMDGPU/amd.endpgm.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll1225
-rw-r--r--llvm/test/CodeGen/AMDGPU/and.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/anyext.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll524
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll2005
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll2155
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll456
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll526
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/bfe-combine.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/bfe-patterns.ll72
-rw-r--r--llvm/test/CodeGen/AMDGPU/bfi_int.ll136
-rw-r--r--llvm/test/CodeGen/AMDGPU/bfm.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/bitreverse.ll140
-rw-r--r--llvm/test/CodeGen/AMDGPU/br_cc.f16.ll74
-rw-r--r--llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/bswap.ll156
-rw-r--r--llvm/test/CodeGen/AMDGPU/build_vector.ll44
-rw-r--r--llvm/test/CodeGen/AMDGPU/calling-conventions.ll324
-rw-r--r--llvm/test/CodeGen/AMDGPU/carryout-selection.ll850
-rw-r--r--llvm/test/CodeGen/AMDGPU/clamp-modifier.ll418
-rw-r--r--llvm/test/CodeGen/AMDGPU/clamp.ll1334
-rw-r--r--llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll257
-rw-r--r--llvm/test/CodeGen/AMDGPU/copy_to_scc.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctlz.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll86
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctpop16.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctpop64.ll76
-rw-r--r--llvm/test/CodeGen/AMDGPU/cttz.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll305
-rw-r--r--llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll82
-rw-r--r--llvm/test/CodeGen/AMDGPU/ds-alignment.ll270
-rw-r--r--llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/ds_write2.ll46
-rw-r--r--llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll122
-rw-r--r--llvm/test/CodeGen/AMDGPU/fabs.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/fadd.f16.ll422
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll722
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcmp.f16.ll638
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll415
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll356
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll310
-rw-r--r--llvm/test/CodeGen/AMDGPU/fdiv.f16.ll398
-rw-r--r--llvm/test/CodeGen/AMDGPU/fdiv.ll107
-rw-r--r--llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll184
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat_atomics.ll336
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll2234
-rw-r--r--llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll324
-rw-r--r--llvm/test/CodeGen/AMDGPU/fma-combine.ll162
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmax3.ll448
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll40
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmaximum.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmed3.ll2790
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmin3.ll664
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll80
-rw-r--r--llvm/test/CodeGen/AMDGPU/fminimum.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmul.f16.ll342
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll752
-rw-r--r--llvm/test/CodeGen/AMDGPU/fnearbyint.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll84
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-fabs.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg.ll274
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-classify.ll282
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll100
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll262
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp_to_sint.ll126
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp_to_uint.ll104
-rw-r--r--llvm/test/CodeGen/AMDGPU/fpext.f16.ll586
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptosi.f16.ll266
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptoui.f16.ll266
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll972
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.ll812
-rw-r--r--llvm/test/CodeGen/AMDGPU/frem.ll196
-rw-r--r--llvm/test/CodeGen/AMDGPU/fshl.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/fshr.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/fsub.f16.ll252
-rw-r--r--llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics.ll650
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll180
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll2644
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll564
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll2044
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll1188
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll1188
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll1612
-rw-r--r--llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/idiv-licm.ll507
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot2.ll367
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot4s.ll607
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot4u.ll865
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot8s.ll258
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot8u.ll259
-rw-r--r--llvm/test/CodeGen/AMDGPU/imm.ll642
-rw-r--r--llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll442
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll96
-rw-r--r--llvm/test/CodeGen/AMDGPU/kernel-args.ll564
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll380
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll124
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll1072
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll1860
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll37
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll28
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll27
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll736
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll1268
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll66
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll224
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll192
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll100
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll150
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll310
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll310
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll334
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll96
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll214
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll596
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp.ll209
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp10.ll209
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp2.ll41
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll536
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log.ll124
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log10.ll124
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.log2.ll156
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll524
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll520
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.mulo.ll424
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll38
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll100
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.round.ll426
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll80
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll80
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-f64.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i1.ll3494
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i16.ll1583
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i32.ll413
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i64.ll88
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i8.ll2533
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i16.ll582
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i32.ll448
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll78
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll9
-rw-r--r--llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll140
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad.u16.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad_64_32.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/madak.ll76
-rw-r--r--llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll420
-rw-r--r--llvm/test/CodeGen/AMDGPU/max.i16.ll48
-rw-r--r--llvm/test/CodeGen/AMDGPU/memory_clause.ll25
-rw-r--r--llvm/test/CodeGen/AMDGPU/min.ll132
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll34
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll60
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul.ll1510
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul_int24.ll94
-rw-r--r--llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/offset-split-flat.ll890
-rw-r--r--llvm/test/CodeGen/AMDGPU/offset-split-global.ll722
-rw-r--r--llvm/test/CodeGen/AMDGPU/omod.ll88
-rw-r--r--llvm/test/CodeGen/AMDGPU/optimize-compare.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/or.ll270
-rw-r--r--llvm/test/CodeGen/AMDGPU/packed-op-sel.ll64
-rw-r--r--llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/preload-kernargs.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/rcp-pattern.ll122
-rw-r--r--llvm/test/CodeGen/AMDGPU/rotl.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/rotr.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/saddo.ll258
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdiv.ll278
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll370
-rw-r--r--llvm/test/CodeGen/AMDGPU/select.f16.ll630
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl.ll386
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl.v2i16.ll140
-rw-r--r--llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll1364
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/sign_extend.ll184
-rw-r--r--llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll354
-rw-r--r--llvm/test/CodeGen/AMDGPU/sitofp.f16.ll182
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll5170
-rw-r--r--llvm/test/CodeGen/AMDGPU/sra.ll274
-rw-r--r--llvm/test/CodeGen/AMDGPU/srl.ll70
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub.ll236
-rw-r--r--llvm/test/CodeGen/AMDGPU/sub.v2i16.ll304
-rw-r--r--llvm/test/CodeGen/AMDGPU/trap-abis.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc-combine.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/uaddo.ll178
-rw-r--r--llvm/test/CodeGen/AMDGPU/udiv.ll230
-rw-r--r--llvm/test/CodeGen/AMDGPU/udivrem.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll284
-rw-r--r--llvm/test/CodeGen/AMDGPU/uitofp.f16.ll182
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-cfg.ll150
-rw-r--r--llvm/test/CodeGen/AMDGPU/usubo.ll178
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_cndmask.ll250
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_madak_f16.ll56
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_pack.ll82
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/wave32.ll282
-rw-r--r--llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/xor.ll174
-rw-r--r--llvm/test/CodeGen/AMDGPU/zero_extend.ll2
265 files changed, 43685 insertions, 43512 deletions
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 4551a3a..9fbedce 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -867,13 +867,104 @@ def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
+class SMRDAlignedLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{
+ // True for loads of <= 4 bytes, or wider loads aligned to at least PowerOf2Ceil(Size).
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ unsigned Size = Ld->getMemoryVT().getStoreSize();
+ return (Size <= 4) || (Ld->getAlign().value() >= PowerOf2Ceil(Size));
+}]> {
+ let GISelPredicateCode = [{
+ auto &Ld = cast<GLoad>(MI);
+ TypeSize Size = Ld.getMMO().getSize().getValue();
+ return (Size <= 4) || (Ld.getMMO().getAlign().value() >= PowerOf2Ceil(Size));
+ }];
+}
+
+class SMRDUnalignedLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{
+ // True only for loads wider than 4 bytes aligned to less than PowerOf2Ceil(Size).
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ unsigned Size = Ld->getMemoryVT().getStoreSize();
+ return (Size > 4) && (Ld->getAlign().value() < PowerOf2Ceil(Size));
+}]> {
+ let GISelPredicateCode = [{
+ auto &Ld = cast<GLoad>(MI);
+ TypeSize Size = Ld.getMMO().getSize().getValue();
+ return (Size > 4) && (Ld.getMMO().getAlign().value() < PowerOf2Ceil(Size));
+ }];
+}
+
+def alignedmultidwordload : SMRDAlignedLoadPat<smrd_load>;
+def unalignedmultidwordload : SMRDUnalignedLoadPat<smrd_load>;
+
+multiclass SMRD_Align_Pattern <string Instr, ValueType vt> {
+
+ // 1. IMM offset (GFX8+): _IMM if aligned, _IMM_ec if under-aligned.
+ def : GCNPat <
+ (alignedmultidwordload (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM_ec") $sbase, $offset, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+
+ // 2. SGPR offset: _SGPR[_ec] on GFX8; _SGPR_IMM[_ec] with imm 0 on GFX9+.
+ def : GCNPat <
+ (alignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> {
+ let OtherPredicates = [isGFX8Only];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_ec") $sbase, $soffset, 0))> {
+ let OtherPredicates = [isGFX8Only];
+ }
+ def : GCNPat <
+ (alignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM_ec") $sbase, $soffset, 0, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+
+ // 3. SGPR+IMM offset (GFX9+ only): _SGPR_IMM if aligned, else _SGPR_IMM_ec.
+ def : GCNPat <
+ (alignedmultidwordload (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM_ec") $sbase, $soffset, $offset, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+
+ // 4. No offset (GFX8+): bare base register, _IMM[_ec] with offset 0.
+ def : GCNPat <
+ (vt (alignedmultidwordload (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+ def : GCNPat <
+ (vt (unalignedmultidwordload (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM_ec") i64:$sbase, 0, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+}
+
multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
// 1. IMM offset
def : GCNPat <
(smrd_load (SMRDImm i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
- >;
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))> {
+ let OtherPredicates = [isGFX6GFX7];
+ }
// 2. 32-bit IMM offset on CI
if immci then def : GCNPat <
@@ -886,26 +977,17 @@ multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
def : GCNPat <
(smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> {
- let OtherPredicates = [isNotGFX9Plus];
- }
- def : GCNPat <
- (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> {
- let OtherPredicates = [isGFX9Plus];
+ let OtherPredicates = [isGFX6GFX7];
}
- // 4. SGPR+IMM offset
+ // 4. No offset
def : GCNPat <
- (smrd_load (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
- let OtherPredicates = [isGFX9Plus];
+ (vt (smrd_load (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))> {
+ let OtherPredicates = [isGFX6GFX7];
}
- // 5. No offset
- def : GCNPat <
- (vt (smrd_load (i64 SReg_64:$sbase))),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))
- >;
+ defm : SMRD_Align_Pattern<Instr, vt>;
}
multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
index a38b6e3..9a8672d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@@ -7,11 +7,11 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s6, s0
-; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -23,10 +23,10 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -59,11 +59,11 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s0, s6, s0
-; GFX11-NEXT: s_subb_u32 s1, s7, s1
+; GFX11-NEXT: s_sub_u32 s0, s6, s2
+; GFX11-NEXT: s_subb_u32 s1, s7, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -75,10 +75,10 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
index bb5ccc3..57a8bbb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
@@ -113,9 +113,9 @@ bb1:
define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
; WAVE64-LABEL: brcond_sgpr_trunc_and:
; WAVE64: ; %bb.0: ; %entry
-; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; WAVE64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; WAVE64-NEXT: s_waitcnt lgkmcnt(0)
-; WAVE64-NEXT: s_and_b32 s0, s0, s1
+; WAVE64-NEXT: s_and_b32 s0, s2, s3
; WAVE64-NEXT: s_xor_b32 s0, s0, 1
; WAVE64-NEXT: s_and_b32 s0, s0, 1
; WAVE64-NEXT: s_cmp_lg_u32 s0, 0
@@ -131,9 +131,9 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
;
; WAVE32-LABEL: brcond_sgpr_trunc_and:
; WAVE32: ; %bb.0: ; %entry
-; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; WAVE32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; WAVE32-NEXT: s_waitcnt lgkmcnt(0)
-; WAVE32-NEXT: s_and_b32 s0, s0, s1
+; WAVE32-NEXT: s_and_b32 s0, s2, s3
; WAVE32-NEXT: s_xor_b32 s0, s0, 1
; WAVE32-NEXT: s_and_b32 s0, s0, 1
; WAVE32-NEXT: s_cmp_lg_u32 s0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 3f034ea..9cabe0c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1400,11 +1400,11 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI: ; %bb.0: ; %bb
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
@@ -1412,8 +1412,8 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_add_f32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index a018ea5..ce0d9c3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -27,10 +27,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -43,10 +43,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 4e94a64..081e257 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1021,20 +1021,20 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1044,20 +1044,20 @@ main_body:
define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1067,20 +1067,20 @@ main_body:
define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1090,21 +1090,21 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: s_mov_b32 s4, s3
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: s_mov_b32 s2, s5
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB39_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
@@ -1112,14 +1112,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
; GFX90A-NEXT: .LBB39_3:
; GFX90A-NEXT: s_endpgm
@@ -1134,14 +1134,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB39_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: .LBB39_2:
@@ -1162,13 +1162,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB40_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: .LBB40_2:
@@ -1184,14 +1184,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB40_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB40_2:
@@ -1204,21 +1204,21 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: s_mov_b32 s4, s3
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: s_mov_b32 s2, s5
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB41_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
@@ -1226,14 +1226,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB41_2
; GFX90A-NEXT: .LBB41_3:
; GFX90A-NEXT: s_endpgm
@@ -1248,14 +1248,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB41_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: .LBB41_2:
@@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB42_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: .LBB42_2:
@@ -1298,14 +1298,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB42_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB42_2:
@@ -1480,34 +1480,34 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: s_mov_b32 s4, s3
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: s_mov_b32 s2, s5
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB49_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB49_2
; GFX90A-NEXT: .LBB49_3:
; GFX90A-NEXT: s_endpgm
@@ -1522,14 +1522,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB49_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB49_2:
@@ -1542,11 +1542,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1566,10 +1566,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1583,11 +1583,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
@@ -1595,10 +1595,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1612,11 +1612,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1637,10 +1637,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1761,19 +1761,19 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
@@ -1803,11 +1803,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1825,10 +1825,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1842,19 +1842,19 @@ main_body:
define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
@@ -1884,19 +1884,19 @@ main_body:
define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 05cdb54..4635db9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -38,20 +38,20 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x8
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_rcp_f32_e32 v2, v2
; VI-NEXT: v_mul_f32_e32 v0, v0, v2
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
+; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, v1, s2
+; VI-NEXT: v_fma_f16 v2, -v0, v1, s0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_short v[0:1], v2
@@ -88,16 +88,16 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x8
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f16_e32 v0, s0
-; VI-NEXT: v_mul_f16_e32 v0, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f16_e32 v0, s1
+; VI-NEXT: v_mul_f16_e32 v0, s0, v0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, s0, v1
+; VI-NEXT: v_fma_f16 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_short v[0:1], v2
@@ -134,16 +134,16 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x8
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f16_e32 v0, s0
-; VI-NEXT: v_mul_f16_e32 v0, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f16_e32 v0, s1
+; VI-NEXT: v_mul_f16_e32 v0, s0, v0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, s0, v1
+; VI-NEXT: v_fma_f16 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_short v[0:1], v2
@@ -189,14 +189,14 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x10
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2
-; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
+; VI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s6
+; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
@@ -207,9 +207,9 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
+; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s6
; VI-NEXT: v_trunc_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v2, -v1, v0, s2
+; VI-NEXT: v_fma_f32 v2, -v1, v0, s6
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -244,16 +244,16 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x10
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f32_e32 v0, s0
-; VI-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f32_e32 v0, s1
+; VI-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-NEXT: v_trunc_f32_e32 v0, v0
-; VI-NEXT: v_fma_f32 v2, -v0, s0, v1
+; VI-NEXT: v_fma_f32 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -288,16 +288,16 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x10
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f32_e32 v0, s0
-; VI-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f32_e32 v0, s1
+; VI-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-NEXT: v_trunc_f32_e32 v0, v0
-; VI-NEXT: v_fma_f32 v2, -v0, s0, v1
+; VI-NEXT: v_fma_f32 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -342,15 +342,15 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3]
-; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1]
+; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -359,9 +359,9 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
+; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
+; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -402,23 +402,23 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
-; VI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
+; VI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
+; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -459,23 +459,23 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
-; VI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
+; VI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
+; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -546,31 +546,31 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x10
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; VI-NEXT: s_lshr_b32 s3, s0, 16
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
+; VI-NEXT: s_lshr_b32 s3, s1, 16
; VI-NEXT: v_cvt_f32_f16_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_rcp_f32_e32 v2, v2
-; VI-NEXT: s_lshr_b32 s1, s2, 16
+; VI-NEXT: s_lshr_b32 s2, s0, 16
; VI-NEXT: v_rcp_f32_e32 v3, v3
; VI-NEXT: v_mul_f32_e32 v0, v0, v2
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
+; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v1, s1
+; VI-NEXT: v_fma_f16 v0, -v0, v1, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v1, s2
; VI-NEXT: v_mul_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s1
+; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s2
; VI-NEXT: v_trunc_f16_e32 v1, v1
-; VI-NEXT: v_fma_f16 v1, -v1, v2, s1
+; VI-NEXT: v_fma_f16 v1, -v1, v2, s2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -683,47 +683,47 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x20
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; VI-NEXT: s_lshr_b32 s8, s0, 16
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s2
+; VI-NEXT: s_lshr_b32 s8, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v3, s8
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_rcp_f32_e32 v2, v2
-; VI-NEXT: s_lshr_b32 s6, s2, 16
+; VI-NEXT: s_lshr_b32 s6, s0, 16
; VI-NEXT: v_rcp_f32_e32 v3, v3
-; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
+; VI-NEXT: v_cvt_f32_f16_e32 v4, s3
; VI-NEXT: v_mul_f32_e32 v0, v0, v2
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_rcp_f32_e32 v4, v4
-; VI-NEXT: s_lshr_b32 s9, s1, 16
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
+; VI-NEXT: s_lshr_b32 s9, s3, 16
+; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
+; VI-NEXT: v_fma_f16 v0, -v0, v1, s0
; VI-NEXT: v_cvt_f32_f16_e32 v1, s6
; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
-; VI-NEXT: s_lshr_b32 s7, s3, 16
+; VI-NEXT: s_lshr_b32 s7, s1, 16
; VI-NEXT: v_mul_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_rcp_f32_e32 v5, v5
; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6
; VI-NEXT: v_trunc_f16_e32 v1, v1
; VI-NEXT: v_fma_f16 v1, -v1, v2, s6
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_mul_f32_e32 v2, v2, v4
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_mov_b32_e32 v4, s9
-; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s3
+; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s1
; VI-NEXT: v_trunc_f16_e32 v2, v2
-; VI-NEXT: v_fma_f16 v2, -v2, v3, s3
+; VI-NEXT: v_fma_f16 v2, -v2, v3, s1
; VI-NEXT: v_cvt_f32_f16_e32 v3, s7
; VI-NEXT: v_mul_f32_e32 v3, v3, v5
; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
@@ -793,14 +793,14 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x20
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2
-; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0
+; VI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
@@ -811,12 +811,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
+; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s0
; VI-NEXT: v_trunc_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v0, -v1, v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3
-; VI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3
+; VI-NEXT: v_fma_f32 v0, -v1, v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s1
+; VI-NEXT: v_div_scale_f32 v3, vcc, s1, v1, s1
; VI-NEXT: v_rcp_f32_e32 v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
@@ -827,9 +827,9 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_fma_f32 v2, -v2, v5, v3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s3
+; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s1
; VI-NEXT: v_trunc_f32_e32 v2, v2
-; VI-NEXT: v_fma_f32 v1, -v2, v1, s3
+; VI-NEXT: v_fma_f32 v1, -v2, v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
index 5d48168..83a85c7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
@@ -16,9 +16,9 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+ ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
@@ -64,9 +64,9 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+ ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
index c444772..7bbce45 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
@@ -167,15 +167,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1 = COPY %1
;
; GFX10-LABEL: name: load_constant_v2s32_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 4, addrspace 4)
$sgpr0_sgpr1 = COPY %1
@@ -210,15 +210,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1 = COPY %1
;
; GFX10-LABEL: name: load_constant_v4s16_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 4, addrspace 4)
$sgpr0_sgpr1 = COPY %1
@@ -254,15 +254,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
;
; GFX10-LABEL: name: load_constant_v4s32_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -342,15 +342,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1 = COPY %1
;
; GFX10-LABEL: name: load_constant_s64_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(s64) = G_LOAD %0 :: (load (s64), align 4, addrspace 4)
$sgpr0_sgpr1 = COPY %1
@@ -386,15 +386,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
;
; GFX10-LABEL: name: load_constant_v2s64
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -782,15 +782,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
;
; GFX10-LABEL: name: load_constant_v8s16
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -826,15 +826,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
;
; GFX10-LABEL: name: load_constant_v8s32
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
@@ -870,15 +870,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
;
; GFX10-LABEL: name: load_constant_v16s32
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
@@ -914,15 +914,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
;
; GFX10-LABEL: name: load_constant_v8s64
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
index 7587aa0..2a725ec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
@@ -17,7 +17,7 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p
; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @lds
; GFX6: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_MOV_B32_1]], [[S_MOV_B32_]], implicit-def dead $scc
; GFX6: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0
- ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0
+ ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 36, 0
; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]]
; GCN: $m0 = S_MOV_B32 -1
; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index 1a49a38..4671f60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -27,11 +27,11 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: test_div_scale_f32_1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
@@ -40,38 +40,38 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v1
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -110,11 +110,11 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: test_div_scale_f32_2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
@@ -123,38 +123,38 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v2, v1
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v1, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, v1, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -195,7 +195,6 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -208,8 +207,10 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -217,30 +218,32 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[2:3], v[2:3], v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -281,7 +284,6 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -294,8 +296,10 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -303,30 +307,32 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[2:3], v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -661,7 +667,7 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
; GFX8-LABEL: test_div_scale_f64_scalar_num_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -672,33 +678,35 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1]
+; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[2:3]
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_num_1:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_scalar_num_1:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[2:3]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -736,7 +744,7 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
; GFX8-LABEL: test_div_scale_f64_scalar_num_2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -747,33 +755,35 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1]
+; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[2:3], v[0:1], s[2:3]
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_num_2:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[2:3], v[0:1], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_scalar_num_2:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], v[0:1], s[0:1]
+; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], v[0:1], s[2:3]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -811,7 +821,7 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
; GFX8-LABEL: test_div_scale_f64_scalar_den_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -822,33 +832,35 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1]
+; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[2:3], s[2:3], v[0:1]
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_den_1:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[2:3], s[2:3], v[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_scalar_den_1:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], v[0:1]
+; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[2:3], v[0:1]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -886,7 +898,7 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
; GFX8-LABEL: test_div_scale_f64_scalar_den_2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -897,33 +909,35 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1]
+; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[2:3], v[0:1]
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_den_2:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[2:3], v[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_scalar_den_2:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[0:1], v[0:1]
+; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[2:3], v[0:1]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -957,12 +971,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -981,13 +996,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out
; GFX11-LABEL: test_div_scale_f32_all_scalar_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x4c
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x70
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s4
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1015,12 +1030,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s3, v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1039,13 +1055,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out
; GFX11-LABEL: test_div_scale_f32_all_scalar_2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x4c
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x70
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s5, s4
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1074,13 +1090,14 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1090,22 +1107,24 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_all_scalar_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[2:3]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1134,13 +1153,14 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1150,22 +1170,24 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_all_scalar_2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[4:5], s[2:3]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1195,42 +1217,42 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o
;
; GFX8-LABEL: test_div_scale_f32_inline_imm_num:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_inline_imm_num:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, 1.0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_inline_imm_num:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, 1.0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1264,42 +1286,42 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o
;
; GFX8-LABEL: test_div_scale_f32_inline_imm_den:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], 2.0, 2.0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_inline_imm_den:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_div_scale_f32 v0, s0, 2.0, 2.0, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_inline_imm_den:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, 2.0, 2.0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1336,11 +1358,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: test_div_scale_f32_fabs_num:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1] glc
@@ -1350,41 +1372,41 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt
; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_fabs_num:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_fabs_num:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1426,11 +1448,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: test_div_scale_f32_fabs_den:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
@@ -1440,41 +1462,41 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_fabs_den:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_fabs_den:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1509,29 +1531,30 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_val_undef_val:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, 0x41000000
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_val_undef_val:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1557,29 +1580,30 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_undef_val_val:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_div_scale_f32 v0, s0, 0x41000000, 0x41000000, s0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_undef_val_val:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1603,29 +1627,30 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %
; GFX8-LABEL: test_div_scale_f32_undef_undef_val:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, s0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_undef_undef_val:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_undef_undef_val:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1653,29 +1678,29 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0x40200000
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[0:1], s[0:1], v[0:1]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_val_undef_val:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], 0x40200000
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_val_undef_val:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], 0x40200000
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 87d0d71..a4aea63 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -843,75 +843,47 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
}
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
-; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
-; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0
-; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0
-; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40a00000
-; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000
-; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
-; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000
-; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1030-NEXT: flat_load_dword v2, v[0:1]
-; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
-; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
-; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
-; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX1030-NEXT: s_endpgm
-;
-; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
-; GFX1013: ; %bb.0:
-; GFX1013-NEXT: s_clause 0x1
-; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1013-NEXT: v_mov_b32_e32 v3, 0
-; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0
-; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40a00000
-; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000
-; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
-; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000
-; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_mov_b32_e32 v0, s2
-; GFX1013-NEXT: v_mov_b32_e32 v1, s3
-; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1013-NEXT: flat_load_dword v2, v[0:1]
-; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
-; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
-; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
-; GFX1013-NEXT: s_waitcnt vmcnt(0)
-; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX1013-NEXT: s_endpgm
+; GFX10-LABEL: image_bvh64_intersect_ray_nsa_reassign:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX10-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX10-NEXT: v_mov_b32_e32 v7, 4.0
+; GFX10-NEXT: v_mov_b32_e32 v8, 0x40a00000
+; GFX10-NEXT: v_mov_b32_e32 v9, 0x40c00000
+; GFX10-NEXT: v_mov_b32_e32 v10, 0x40e00000
+; GFX10-NEXT: v_mov_b32_e32 v11, 0x41000000
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xb36211c7
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x102
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
+; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: s_mov_b32 s16, 0xb36211c7
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_movk_i32 s17, 0x102
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_mov_b32 s8, 0x40400000
; GFX11-NEXT: s_mov_b32 s12, 0x40c00000
-; GFX11-NEXT: s_mov_b32 s6, 2.0
+; GFX11-NEXT: s_mov_b32 s1, 1.0
; GFX11-NEXT: s_mov_b32 s10, 0x40a00000
; GFX11-NEXT: s_mov_b32 s9, 4.0
; GFX11-NEXT: s_mov_b32 s14, 0x41000000
@@ -921,18 +893,18 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9
; GFX11-NEXT: v_mov_b32_e32 v7, s13
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4
-; GFX11-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_mov_b32 s5, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: s_mov_b32 s2, 2.0
; GFX11-NEXT: v_mov_b32_e32 v10, s17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3]
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[4:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
@@ -954,84 +926,59 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
}
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
-; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
-; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0
-; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200
-; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
-; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
-; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1030-NEXT: flat_load_dword v2, v[0:1]
-; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
-; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
-; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
-; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX1030-NEXT: s_endpgm
-;
-; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
-; GFX1013: ; %bb.0:
-; GFX1013-NEXT: s_clause 0x1
-; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1013-NEXT: v_mov_b32_e32 v3, 0
-; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200
-; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
-; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
-; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_mov_b32_e32 v0, s2
-; GFX1013-NEXT: v_mov_b32_e32 v1, s3
-; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1013-NEXT: flat_load_dword v2, v[0:1]
-; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
-; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
-; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
-; GFX1013-NEXT: s_waitcnt vmcnt(0)
-; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX1013-NEXT: s_endpgm
+; GFX10-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX10-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0x44004200
+; GFX10-NEXT: v_mov_b32_e32 v7, 0x46004500
+; GFX10-NEXT: v_mov_b32_e32 v8, 0x48004700
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xb36211c6
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x102
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
+; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: s_mov_b32 s12, 0xb36211c6
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_movk_i32 s13, 0x102
-; GFX11-NEXT: s_mov_b32 s6, 2.0
+; GFX11-NEXT: s_mov_b32 s1, 1.0
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_mov_b32 s8, 0x42004600
; GFX11-NEXT: s_mov_b32 s9, 0x44004700
; GFX11-NEXT: s_mov_b32 s10, 0x45004800
; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4
-; GFX11-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-NEXT: s_mov_b32 s5, 1.0
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: s_mov_b32 s2, 2.0
; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[4:7] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index e7faabb..66d1f5f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -353,8 +353,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN-NEXT: s_mov_b32 s4, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-NEXT: s_mov_b32 s5, 0x405ec000
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -369,7 +369,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GCN-NEXT: v_accvgpr_write_b32 a5, s9
; GCN-NEXT: v_accvgpr_write_b32 a6, s10
; GCN-NEXT: v_accvgpr_write_b32 a7, s11
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
index c0cd068..f712df2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
@@ -8,14 +8,14 @@
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) {
; GFX8-LABEL: dpp_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -34,12 +34,12 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) {
; GFX11-LABEL: dpp_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8]
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c ; encoding: [0x00,0x01,0x00,0xf4,0x2c,0x00,0x00,0xf8]
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; encoding: [0x80,0x00,0x10,0xca,0x02,0x00,0x00,0x01]
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; encoding: [0x80,0x00,0x10,0xca,0x04,0x00,0x00,0x01]
; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x02,0x00]
; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
@@ -50,38 +50,38 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) {
; GFX8-LABEL: mov_dpp64_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: mov_dpp64_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; encoding: [0x00,0x01,0x08,0xf4,0x24,0x00,0x00,0xfa]
; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
-; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; encoding: [0x06,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; encoding: [0x07,0x02,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x04,0x00]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: mov_dpp64_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8]
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; encoding: [0x00,0x01,0x08,0xf4,0x24,0x00,0x00,0xf8]
; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; encoding: [0x02,0x00,0x10,0xca,0x03,0x00,0x00,0x00]
+; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; encoding: [0x06,0x00,0x10,0xca,0x07,0x00,0x00,0x00]
; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
; GFX11-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; encoding: [0x00,0x00,0x6e,0xdc,0x02,0x00,0x00,0x00]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; encoding: [0x00,0x00,0x6e,0xdc,0x02,0x00,0x04,0x00]
; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index fa24489..3d352db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -4,16 +4,16 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
store i32 %tmp, ptr addrspace(1) %out
@@ -23,17 +23,17 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
store i64 %tmp, ptr addrspace(1) %out
@@ -47,32 +47,32 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s2, 56
-; GCN-NEXT: s_cselect_b32 s4, 1, 0
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, 1
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_mov_b32 s0, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_cbranch_scc0 .LBB2_2
; GCN-NEXT: ; %bb.1: ; %.one
; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
-; GCN-NEXT: s_mov_b32 s2, 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: .LBB2_2: ; %Flow
-; GCN-NEXT: s_xor_b32 s2, s2, 1
-; GCN-NEXT: s_and_b32 s2, s2, 1
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_xor_b32 s0, s0, 1
+; GCN-NEXT: s_and_b32 s0, s0, 1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_4
; GCN-NEXT: ; %bb.3: ; %.zero
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: .LBB2_4: ; %.exit
; GCN-NEXT: s_endpgm
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
@@ -96,17 +96,17 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
store float %tmp, ptr addrspace(1) %out
@@ -116,21 +116,21 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
-; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
+; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
store double %tmp, ptr addrspace(1) %out
@@ -140,17 +140,17 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
store <2 x i16> %tmp, ptr addrspace(1) %out
@@ -160,17 +160,17 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
store <2 x half> %tmp, ptr addrspace(1) %out
@@ -228,17 +228,17 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
store <2 x bfloat> %tmp, ptr addrspace(1) %out
@@ -320,17 +320,17 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
store ptr %tmp, ptr addrspace(1) %out
@@ -340,16 +340,16 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
store ptr addrspace(2) %tmp, ptr addrspace(1) %out
@@ -359,16 +359,16 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
store ptr addrspace(3) %tmp, ptr addrspace(1) %out
@@ -378,16 +378,16 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
store ptr addrspace(5) %tmp, ptr addrspace(1) %out
@@ -397,16 +397,16 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
store ptr addrspace(6) %tmp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index d628270..7f720e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -46,47 +46,47 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
; GFX8-LABEL: update_dppi64_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dppi64_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppi64_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -101,47 +101,47 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) {
; GFX8-LABEL: update_dppf64_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dppf64_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppf64_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -266,47 +266,47 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
; GFX8-LABEL: update_dpp_p0_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dpp_p0_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dpp_p0_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -321,13 +321,13 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) {
; GFX8-LABEL: update_dpp_p3_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: ds_read_b32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -336,11 +336,11 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa
;
; GFX10-LABEL: update_dpp_p3_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, s3
; GFX10-NEXT: ds_read_b32 v1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -349,11 +349,11 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa
;
; GFX11-LABEL: update_dpp_p3_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s3
; GFX11-NEXT: ds_load_b32 v1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -371,17 +371,17 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
; GFX8-LABEL: update_dpp_p5_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s90, -1
; GFX8-NEXT: s_mov_b32 s91, 0xe80000
; GFX8-NEXT: s_add_u32 s88, s88, s3
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_addc_u32 s89, s89, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -390,17 +390,17 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
;
; GFX10-LABEL: update_dpp_p5_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_mov_b32 s7, 0x31c16000
; GFX10-NEXT: s_add_u32 s4, s4, s3
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_addc_u32 s5, s5, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, s3
; GFX10-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -409,11 +409,11 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
;
; GFX11-LABEL: update_dpp_p5_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s3
; GFX11-NEXT: scratch_load_b32 v1, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 6bb1043..9251f26 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -1017,7 +1017,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v3i32_align4:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
@@ -1054,7 +1056,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_i96_align8:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
@@ -1091,7 +1095,9 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v3i32_align8:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
@@ -1128,7 +1134,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v6i16_align8:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
@@ -1166,7 +1174,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v12i8_align8:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s13, s0, 8
; GFX12-NEXT: s_lshr_b32 s12, s0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 1140ef8..e1fcca0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -7,28 +7,28 @@ declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; GFX10-LABEL: v_mul_i64_no_zext:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[4:5]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_no_zext:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1]
-; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v9, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v9, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -36,7 +36,7 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v5, v7
-; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
+; GFX11-NEXT: global_store_b64 v9, v[4:5], s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -76,12 +76,12 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v1, s[6:7]
-; GFX11-NEXT: global_load_b32 v5, v2, s[0:1]
+; GFX11-NEXT: global_load_b32 v5, v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -128,12 +128,12 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -177,13 +177,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -227,12 +227,12 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v5, v0, s[6:7]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -277,12 +277,12 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -327,12 +327,12 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
-; GFX11-NEXT: global_load_b64 v[1:2], v2, s[0:1]
+; GFX11-NEXT: global_load_b64 v[1:2], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v1, v0, v2
; GFX11-NEXT: v_mov_b32_e32 v0, 0
@@ -355,21 +355,21 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a
define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -413,12 +413,12 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -450,21 +450,21 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul64_masked_before_branch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_masked_before_branch:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -534,13 +534,13 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[2:3], v0, s[6:7]
-; GFX11-NEXT: global_load_b64 v[4:5], v0, s[0:1]
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: global_load_b64 v[4:5], v0, s[2:3]
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 2d81452..35de4a3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2559,76 +2559,76 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: s_mul_u64_zext_with_sregs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
-; GFX8-NEXT: s_mulk_i32 s2, 0x50
-; GFX8-NEXT: v_readfirstlane_b32 s3, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT: s_mulk_i32 s0, 0x50
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_sregs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
-; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_mul_i32 s0, s1, 0x50
+; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_sregs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s2, s3, 0x50
-; GFX10-NEXT: s_mul_hi_u32 s3, s3, 0x50
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_mul_i32 s0, s1, 0x50
+; GFX10-NEXT: s_mul_hi_u32 s1, s1, 0x50
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_sregs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s3, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s2, s3, 0x50
-; GFX11-NEXT: s_mul_hi_u32 s3, s3, 0x50
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_mul_i32 s0, s1, 0x50
+; GFX11-NEXT: s_mul_hi_u32 s1, s1, 0x50
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_sregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], 0x50
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2738,88 +2738,88 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: s_mul_u64_sext_with_sregs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
-; GFX8-NEXT: s_ashr_i32 s3, s2, 31
-; GFX8-NEXT: s_mulk_i32 s2, 0x50
-; GFX8-NEXT: s_mulk_i32 s3, 0x50
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_add_u32 s3, s3, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_mulk_i32 s0, 0x50
+; GFX8-NEXT: s_mulk_i32 s1, 0x50
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: s_add_u32 s1, s1, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_sregs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
-; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
-; GFX9-NEXT: s_mulk_i32 s4, 0x50
-; GFX9-NEXT: s_add_u32 s3, s4, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_ashr_i32 s2, s1, 31
+; GFX9-NEXT: s_mul_i32 s0, s1, 0x50
+; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50
+; GFX9-NEXT: s_mulk_i32 s2, 0x50
+; GFX9-NEXT: s_add_u32 s1, s2, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_sext_with_sregs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_ashr_i32 s3, s2, 31
-; GFX10-NEXT: s_mul_hi_u32 s4, s2, 0x50
-; GFX10-NEXT: s_mulk_i32 s3, 0x50
-; GFX10-NEXT: s_mulk_i32 s2, 0x50
-; GFX10-NEXT: s_add_i32 s3, s4, s3
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_mul_hi_u32 s2, s0, 0x50
+; GFX10-NEXT: s_mulk_i32 s1, 0x50
+; GFX10-NEXT: s_mulk_i32 s0, 0x50
+; GFX10-NEXT: s_add_i32 s1, s2, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_sext_with_sregs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_ashr_i32 s3, s2, 31
-; GFX11-NEXT: s_mul_hi_u32 s4, s2, 0x50
-; GFX11-NEXT: s_mulk_i32 s3, 0x50
-; GFX11-NEXT: s_mulk_i32 s2, 0x50
-; GFX11-NEXT: s_add_i32 s3, s4, s3
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_mul_hi_u32 s2, s0, 0x50
+; GFX11-NEXT: s_mulk_i32 s1, 0x50
+; GFX11-NEXT: s_mulk_i32 s0, 0x50
+; GFX11-NEXT: s_add_i32 s1, s2, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_sext_with_sregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_ashr_i32 s1, s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], 0x50
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index c3bd566..5d4f1f65 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -145,25 +145,25 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
; GFX8-LABEL: sdivrem_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_ashr_i32 s2, s9, 31
-; GFX8-NEXT: s_ashr_i32 s12, s11, 31
-; GFX8-NEXT: s_add_u32 s0, s8, s2
-; GFX8-NEXT: s_addc_u32 s1, s9, s2
-; GFX8-NEXT: s_add_u32 s8, s10, s12
-; GFX8-NEXT: s_mov_b32 s13, s12
-; GFX8-NEXT: s_addc_u32 s9, s11, s12
-; GFX8-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8
+; GFX8-NEXT: s_ashr_i32 s2, s13, 31
+; GFX8-NEXT: s_ashr_i32 s4, s15, 31
+; GFX8-NEXT: s_add_u32 s0, s12, s2
+; GFX8-NEXT: s_addc_u32 s1, s13, s2
+; GFX8-NEXT: s_add_u32 s6, s14, s4
+; GFX8-NEXT: s_mov_b32 s5, s4
+; GFX8-NEXT: s_addc_u32 s7, s15, s4
+; GFX8-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s6
; GFX8-NEXT: s_mov_b32 s3, s2
-; GFX8-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3]
+; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3]
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT: s_sub_u32 s14, 0, s8
-; GFX8-NEXT: s_subb_u32 s15, 0, s9
+; GFX8-NEXT: s_sub_u32 s14, 0, s6
+; GFX8-NEXT: s_subb_u32 s15, 0, s7
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
@@ -223,53 +223,53 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0
-; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1
-; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1
+; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX8-NEXT: v_mul_hi_u32 v5, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v4, s11, v1
+; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_mul_hi_u32 v3, s10, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s11
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v6, s13
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s12, v0
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v4, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
+; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s13, v1
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s6, v0
; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc
; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v7
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v8
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
@@ -284,7 +284,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3
; GFX8-NEXT: v_mov_b32_e32 v3, s1
@@ -295,35 +295,35 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sdivrem_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s2, s9, 31
-; GFX9-NEXT: s_ashr_i32 s12, s11, 31
-; GFX9-NEXT: s_add_u32 s0, s8, s2
-; GFX9-NEXT: s_addc_u32 s1, s9, s2
-; GFX9-NEXT: s_add_u32 s8, s10, s12
-; GFX9-NEXT: s_mov_b32 s13, s12
-; GFX9-NEXT: s_addc_u32 s9, s11, s12
-; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8
+; GFX9-NEXT: s_ashr_i32 s2, s13, 31
+; GFX9-NEXT: s_ashr_i32 s4, s15, 31
+; GFX9-NEXT: s_add_u32 s0, s12, s2
+; GFX9-NEXT: s_addc_u32 s1, s13, s2
+; GFX9-NEXT: s_add_u32 s6, s14, s4
+; GFX9-NEXT: s_mov_b32 s5, s4
+; GFX9-NEXT: s_addc_u32 s7, s15, s4
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
; GFX9-NEXT: s_mov_b32 s3, s2
-; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3]
+; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3]
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_u32 s14, 0, s8
-; GFX9-NEXT: s_subb_u32 s15, 0, s9
+; GFX9-NEXT: s_sub_u32 s14, 0, s6
+; GFX9-NEXT: s_subb_u32 s15, 0, s7
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
@@ -357,7 +357,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-NEXT: v_mov_b32_e32 v7, s7
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
@@ -382,52 +382,52 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0
-; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1
-; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1
+; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, s11, v1
+; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, s10, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s11
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s10, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v6, s13
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s12, v0
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2]
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
-; GFX9-NEXT: v_sub_u32_e32 v1, s11, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2
+; GFX9-NEXT: v_sub_u32_e32 v1, s13, v1
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s8, v0
+; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0
; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v8
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9
+; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -442,7 +442,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
-; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13]
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5
; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -453,27 +453,27 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v6, s2
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdivrem_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_ashr_i32 s2, s9, 31
-; GFX10-NEXT: s_ashr_i32 s12, s11, 31
-; GFX10-NEXT: s_add_u32 s0, s8, s2
-; GFX10-NEXT: s_addc_u32 s1, s9, s2
-; GFX10-NEXT: s_add_u32 s8, s10, s12
-; GFX10-NEXT: s_mov_b32 s13, s12
-; GFX10-NEXT: s_addc_u32 s9, s11, s12
+; GFX10-NEXT: s_ashr_i32 s2, s13, 31
+; GFX10-NEXT: s_ashr_i32 s4, s15, 31
+; GFX10-NEXT: s_add_u32 s0, s12, s2
+; GFX10-NEXT: s_addc_u32 s1, s13, s2
+; GFX10-NEXT: s_add_u32 s6, s14, s4
+; GFX10-NEXT: s_mov_b32 s5, s4
+; GFX10-NEXT: s_addc_u32 s7, s15, s4
; GFX10-NEXT: s_mov_b32 s3, s2
-; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s9
-; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8
-; GFX10-NEXT: s_sub_u32 s10, 0, s8
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX10-NEXT: s_sub_u32 s12, 0, s6
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -484,11 +484,12 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v2
; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s11, s10, v3, 0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s11, s10, v4, v[1:2]
-; GFX10-NEXT: s_subb_u32 s11, 0, s9
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s13, s12, v3, 0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s13, s12, v4, v[1:2]
+; GFX10-NEXT: s_subb_u32 s13, 0, s7
; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s14, s11, v3, v[1:2]
+; GFX10-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s14, s13, v3, v[1:2]
; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0
; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1
@@ -510,28 +511,28 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v0
; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s10, v3, 0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s10, v4, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s12, v3, 0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s12, v4, v[1:2]
; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s11, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s13, v3, v[1:2]
; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0
; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1
; GFX10-NEXT: v_mul_lo_u32 v7, v4, v1
; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1
; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v6, s10, v7, v6
-; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v2, s10, v6, v8
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v6, s12, v7, v6
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v0, s12, v2, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v2, s12, v6, v8
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12
; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0
; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6
-; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s10
+; GFX10-NEXT: v_add_co_u32 v0, s12, v2, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0
; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX10-NEXT: v_mul_lo_u32 v2, s1, v0
@@ -540,71 +541,70 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_mul_hi_u32 v0, s1, v0
; GFX10-NEXT: v_mul_lo_u32 v3, s0, v1
; GFX10-NEXT: v_mul_lo_u32 v5, s1, v1
-; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v3
+; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v3
; GFX10-NEXT: v_mul_hi_u32 v3, s0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v0, s10, v5, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v0, s12, v5, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s12
; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2
-; GFX10-NEXT: v_add_co_u32 v0, s10, v0, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v5, s10, v0, v2
+; GFX10-NEXT: v_add_co_u32 v0, s12, v0, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v5, s12, v0, v2
; GFX10-NEXT: v_mul_hi_u32 v2, s1, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12
; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s10, s8, v5, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s12, s6, v5, 0
; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s8, v3, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s9, v5, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s6, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s7, v5, v[1:2]
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1
; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v0
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v0
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s8
+; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s6
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v1
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v1
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v8
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v9
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v9
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
; GFX10-NEXT: v_add_co_u32 v13, s0, v2, 1
; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v9
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v9
; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v7, s0
-; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s8
+; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s6
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo
-; GFX10-NEXT: s_xor_b64 s[8:9], s[2:3], s[12:13]
; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: v_xor_b32_e32 v2, s8, v2
-; GFX10-NEXT: v_xor_b32_e32 v3, s9, v3
+; GFX10-NEXT: v_xor_b32_e32 v2, s4, v2
+; GFX10-NEXT: v_xor_b32_e32 v3, s5, v3
; GFX10-NEXT: v_xor_b32_e32 v5, s2, v0
; GFX10-NEXT: v_xor_b32_e32 v6, s2, v1
-; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, s8
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s9, v3, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, s4
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v3, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v5, s2
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s2, v6, vcc_lo
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX10-NEXT: s_endpgm
%div = sdiv i64 %x, %y
store i64 %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 63a0d8a..51c213e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -112,12 +112,12 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
; GFX8-LABEL: udivrem_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10
-; GFX8-NEXT: s_sub_u32 s2, 0, s10
-; GFX8-NEXT: s_subb_u32 s3, 0, s11
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s15
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GFX8-NEXT: s_sub_u32 s2, 0, s14
+; GFX8-NEXT: s_subb_u32 s3, 0, s15
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -180,53 +180,53 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
-; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1
+; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX8-NEXT: v_mul_hi_u32 v5, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v4, s9, v1
+; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s9
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2]
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v0
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v3, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v6, s13
+; GFX8-NEXT: v_mov_b32_e32 v5, s15
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v4, v[1:2]
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s12, v0
; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6
+; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s13, v1
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v6
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v6
; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s10, v2
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v2
; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc
; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7
; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v7
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v8
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s14, v7
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
@@ -241,22 +241,22 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v14, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v4, s[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: udivrem_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10
-; GFX9-NEXT: s_sub_u32 s2, 0, s10
-; GFX9-NEXT: s_subb_u32 s3, 0, s11
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GFX9-NEXT: s_sub_u32 s2, 0, s14
+; GFX9-NEXT: s_subb_u32 s3, 0, s15
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -293,7 +293,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s11
+; GFX9-NEXT: v_mov_b32_e32 v7, s15
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
@@ -318,52 +318,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0
-; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1
+; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1
+; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v5, 0
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v5, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v3, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v6, s13
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v5, v[1:2]
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v0
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v5, v[1:2]
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s12, v0
; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6
-; GFX9-NEXT: v_sub_u32_e32 v0, s9, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v6
+; GFX9-NEXT: v_sub_u32_e32 v0, s13, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v6
; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s10, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s14, v2
; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc
; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v9
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v9
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v8
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8
; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s10, v8
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v9
+; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s14, v8
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
@@ -378,17 +378,17 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11
-; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10
-; GFX10-NEXT: s_sub_u32 s0, 0, s10
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s15
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GFX10-NEXT: s_sub_u32 s0, 0, s14
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -401,7 +401,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s0, v3, 0
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s0, v4, v[1:2]
-; GFX10-NEXT: s_subb_u32 s1, 0, s11
+; GFX10-NEXT: s_subb_u32 s1, 0, s15
; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, s1, v3, v[1:2]
; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0
@@ -449,14 +449,14 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0
; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX10-NEXT: v_mul_lo_u32 v2, s9, v0
+; GFX10-NEXT: v_mul_lo_u32 v2, s13, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
-; GFX10-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX10-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX10-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX10-NEXT: v_mul_lo_u32 v5, s9, v1
+; GFX10-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX10-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX10-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX10-NEXT: v_mul_lo_u32 v5, s13, v1
; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v3
-; GFX10-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX10-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v4
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -466,38 +466,38 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2
-; GFX10-NEXT: v_mul_hi_u32 v2, s9, v1
+; GFX10-NEXT: v_mul_hi_u32 v2, s13, v1
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s10, v5, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s14, v5, 0
; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s10, v3, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s11, v5, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s14, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s15, v5, v[1:2]
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s8, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, s9, v1
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s9, v1, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v6, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v7
+; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s12, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v6, s13, v1
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s13, v1, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v7
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s10
+; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s14
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v8
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v8
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v6
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v6
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v9
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v9
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
; GFX10-NEXT: v_add_co_u32 v13, s0, v2, 1
; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v9
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v9
; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v8
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v8
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0
-; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s10
+; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s14
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
@@ -509,8 +509,8 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v6, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v9, s0
-; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
-; GFX10-NEXT: global_store_dwordx2 v10, v[2:3], s[6:7]
+; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v10, v[2:3], s[10:11]
; GFX10-NEXT: s_endpgm
%div = udiv i64 %x, %y
store i64 %div, ptr addrspace(1) %out0
@@ -979,13 +979,13 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
; GFX8-LABEL: udivrem_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20
-; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX8-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x20
+; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12
-; GFX8-NEXT: s_sub_u32 s2, 0, s12
-; GFX8-NEXT: s_subb_u32 s3, 0, s13
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s17
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s16
+; GFX8-NEXT: s_sub_u32 s2, 0, s16
+; GFX8-NEXT: s_subb_u32 s3, 0, s17
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -1025,12 +1025,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: s_sub_u32 s2, 0, s14
+; GFX8-NEXT: s_sub_u32 s2, 0, s18
; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT: s_subb_u32 s3, 0, s15
+; GFX8-NEXT: s_subb_u32 s3, 0, s19
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
@@ -1050,46 +1050,46 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
-; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v4, s9, v1
+; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
-; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v6, 0
+; GFX8-NEXT: v_mul_hi_u32 v4, s13, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v7, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s8, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v6, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v4, s13
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v7, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v3, s13
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s12, v0
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v6, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v4, s17
; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s9, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0
+; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s13, v1
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v8
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v0
; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1]
-; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15
+; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s19
; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s18
; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
-; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8
+; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s16, v8
; GFX8-NEXT: v_add_f32_e32 v1, v2, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc
@@ -1101,13 +1101,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_f32_e32 v1, v2, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v1
; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v7, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v11
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v10
; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0
; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v14
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v11
; GFX8-NEXT: v_cndmask_b32_e64 v16, v3, v16, s[0:1]
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v14, v[2:3]
; GFX8-NEXT: v_add_u32_e64 v17, s[0:1], 1, v12
@@ -1116,7 +1116,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v4, vcc
; GFX8-NEXT: v_mul_lo_u32 v4, v14, v1
; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2
-; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10
+; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s16, v10
; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v3, vcc
; GFX8-NEXT: v_mul_hi_u32 v3, v15, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
@@ -1175,55 +1175,55 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3
-; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4
+; GFX8-NEXT: v_mul_lo_u32 v7, s15, v3
+; GFX8-NEXT: v_mul_lo_u32 v8, s14, v4
; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
-; GFX8-NEXT: v_mul_hi_u32 v0, s10, v3
-; GFX8-NEXT: v_mul_hi_u32 v3, s11, v3
+; GFX8-NEXT: v_mul_hi_u32 v0, s14, v3
+; GFX8-NEXT: v_mul_hi_u32 v3, s15, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4
+; GFX8-NEXT: v_mul_lo_u32 v7, s15, v4
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0
-; GFX8-NEXT: v_mul_hi_u32 v8, s10, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, s14, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v3, v0
-; GFX8-NEXT: v_mul_hi_u32 v8, s11, v4
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0
+; GFX8-NEXT: v_mul_hi_u32 v8, s15, v4
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s18, v9, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v8, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s11
-; GFX8-NEXT: v_mov_b32_e32 v0, s15
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8]
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s10, v3
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s18, v10, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, s15
+; GFX8-NEXT: v_mov_b32_e32 v0, s19
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s19, v9, v[7:8]
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s14, v3
; GFX8-NEXT: v_subb_u32_e64 v11, s[0:1], v4, v7, vcc
-; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v7
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11
+; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s15, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v11
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v8
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v11
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s18, v8
; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc
; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v9
; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v10, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v12
; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7
; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12
-; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v7
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v12
+; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s18, v7
; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13
; GFX8-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
@@ -1234,30 +1234,30 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v13, s[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v10, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s9
; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v9, s4
+; GFX8-NEXT: v_mov_b32_e32 v9, s8
; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v0, s[0:1]
; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[1:4]
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: udivrem_v2i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20
+; GFX9-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x20
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12
-; GFX9-NEXT: s_sub_u32 s2, 0, s12
-; GFX9-NEXT: s_subb_u32 s3, 0, s13
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s17
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s16
+; GFX9-NEXT: s_sub_u32 s2, 0, s16
+; GFX9-NEXT: s_subb_u32 s3, 0, s17
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
@@ -1293,12 +1293,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: s_sub_u32 s2, 0, s14
+; GFX9-NEXT: s_sub_u32 s2, 0, s18
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT: s_subb_u32 s3, 0, s15
+; GFX9-NEXT: s_subb_u32 s3, 0, s19
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
@@ -1317,48 +1317,47 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0
-; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1
+; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT: v_mul_hi_u32 v5, s13, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1
+; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, s13
+; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, s17
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, 0
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v8, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5
; GFX9-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v5, s9
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v9, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v5, s13
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s13, v8, v[2:3]
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v1
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s17, v8, v[2:3]
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s12, v1
; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v2
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v2
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v1
-; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v1
+; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3
; GFX9-NEXT: v_cndmask_b32_e64 v10, v4, v5, s[0:1]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s15
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s19
; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v6, vcc
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s14
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s18
; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
-; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s12, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s16, v2
; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc
@@ -1370,13 +1369,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v3
; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v12
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v12
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v11
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v11
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0
; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v12
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v12
; GFX9-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[0:1]
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5]
; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13
@@ -1385,7 +1384,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v6, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3
; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4
-; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v11
+; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s16, v11
; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc
; GFX9-NEXT: v_mul_hi_u32 v5, v16, v3
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
@@ -1441,55 +1440,55 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v16, v5
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v6, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, s11, v5
-; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6
+; GFX9-NEXT: v_mul_lo_u32 v8, s15, v5
+; GFX9-NEXT: v_mul_lo_u32 v9, s14, v6
; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v7, s[0:1]
-; GFX9-NEXT: v_mul_hi_u32 v2, s10, v5
-; GFX9-NEXT: v_mul_hi_u32 v5, s11, v5
+; GFX9-NEXT: v_mul_hi_u32 v2, s14, v5
+; GFX9-NEXT: v_mul_hi_u32 v5, s15, v5
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, s11, v6
+; GFX9-NEXT: v_mul_lo_u32 v8, s15, v6
; GFX9-NEXT: v_add_u32_e32 v2, v9, v2
-; GFX9-NEXT: v_mul_hi_u32 v9, s10, v6
-; GFX9-NEXT: v_mul_hi_u32 v13, s11, v6
+; GFX9-NEXT: v_mul_hi_u32 v9, s14, v6
+; GFX9-NEXT: v_mul_hi_u32 v13, s15, v6
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v5, v2
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s14, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s18, v12, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v10, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v1, v11, v9
; GFX9-NEXT: v_add3_u32 v9, v1, v2, v13
; GFX9-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v9, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v10, s11
-; GFX9-NEXT: v_mov_b32_e32 v6, s15
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v12, v[1:2]
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v5
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v9, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v10, s15
+; GFX9-NEXT: v_mov_b32_e32 v6, s19
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v12, v[1:2]
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s14, v5
; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10
-; GFX9-NEXT: v_sub_u32_e32 v1, s11, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v10
+; GFX9-NEXT: v_sub_u32_e32 v1, s15, v1
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v10
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s14, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s18, v2
; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12
; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v13
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v13
; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v11
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v13
-; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s14, v11
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v13
+; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s18, v11
; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -1504,22 +1503,24 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[0:1]
-; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[6:7]
+; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[8:9]
+; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_v2i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x20
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s13
-; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15
-; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12
-; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s14
-; GFX10-NEXT: s_sub_u32 s0, 0, s12
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s17
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s19
+; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s16
+; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s18
+; GFX10-NEXT: s_sub_u32 s0, 0, s16
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX10-NEXT: s_subb_u32 s1, 0, s13
+; GFX10-NEXT: s_subb_u32 s1, 0, s17
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -1539,13 +1540,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v7, 0
-; GFX10-NEXT: s_sub_u32 s2, 0, s14
+; GFX10-NEXT: s_sub_u32 s2, 0, s18
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s3, s2, v8, 0
; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s3, s0, v9, v[1:2]
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4]
; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0
-; GFX10-NEXT: s_subb_u32 s3, 0, s15
+; GFX10-NEXT: s_subb_u32 s3, 0, s19
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s6, s1, v7, v[4:5]
; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s3, v8, v[5:6]
@@ -1592,7 +1593,6 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s0, v7, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s6, s2, v8, 0
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2]
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4]
@@ -1641,21 +1641,20 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v9, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v8, v1
; GFX10-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v10, v0, vcc_lo
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mul_lo_u32 v3, s9, v4
-; GFX10-NEXT: v_mul_lo_u32 v8, s8, v2
-; GFX10-NEXT: v_mul_hi_u32 v5, s8, v4
-; GFX10-NEXT: v_mul_hi_u32 v4, s9, v4
-; GFX10-NEXT: v_mul_lo_u32 v9, s9, v2
-; GFX10-NEXT: v_mul_lo_u32 v6, s11, v1
-; GFX10-NEXT: v_mul_hi_u32 v10, s8, v2
-; GFX10-NEXT: v_mul_hi_u32 v11, s9, v2
-; GFX10-NEXT: v_mul_lo_u32 v2, s10, v0
-; GFX10-NEXT: v_mul_hi_u32 v7, s10, v1
-; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1
-; GFX10-NEXT: v_mul_lo_u32 v12, s11, v0
-; GFX10-NEXT: v_mul_hi_u32 v13, s10, v0
-; GFX10-NEXT: v_mul_hi_u32 v14, s11, v0
+; GFX10-NEXT: v_mul_lo_u32 v3, s13, v4
+; GFX10-NEXT: v_mul_lo_u32 v8, s12, v2
+; GFX10-NEXT: v_mul_hi_u32 v5, s12, v4
+; GFX10-NEXT: v_mul_hi_u32 v4, s13, v4
+; GFX10-NEXT: v_mul_lo_u32 v9, s13, v2
+; GFX10-NEXT: v_mul_lo_u32 v6, s15, v1
+; GFX10-NEXT: v_mul_hi_u32 v10, s12, v2
+; GFX10-NEXT: v_mul_hi_u32 v11, s13, v2
+; GFX10-NEXT: v_mul_lo_u32 v2, s14, v0
+; GFX10-NEXT: v_mul_hi_u32 v7, s14, v1
+; GFX10-NEXT: v_mul_hi_u32 v1, s15, v1
+; GFX10-NEXT: v_mul_lo_u32 v12, s15, v0
+; GFX10-NEXT: v_mul_hi_u32 v13, s14, v0
+; GFX10-NEXT: v_mul_hi_u32 v14, s15, v0
; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4
@@ -1678,77 +1677,77 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s12, v8, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s16, v8, 0
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s14, v10, 0
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s18, v10, 0
; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7
; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11
; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14
-; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s12, v9, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s16, v9, v[1:2]
; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s14, v7, v[3:4]
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s13, v8, v[4:5]
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s18, v7, v[3:4]
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s17, v8, v[4:5]
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s8, v0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s15, v10, v[5:6]
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s9, v3, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v14
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, s9, v3
+; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s12, v0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s19, v10, v[5:6]
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s13, v3, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s16, v14
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, s13, v3
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0
-; GFX10-NEXT: v_sub_co_u32 v15, s0, s10, v2
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s11, v0, s0
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v15
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, s11, v0
+; GFX10-NEXT: v_sub_co_u32 v15, s0, s14, v2
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s15, v0, s0
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v15
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, s15, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s12
+; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s16
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s13, v5
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s15, v0, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v18
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s17, v5
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s19, v0, s0
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s17, v18
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v17
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s16, v17
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s13, v18
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s17, v18
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v16
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s19, v16
; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v5
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s17, v5
; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0
-; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s12
+; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s16
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s14
+; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s18
; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1
; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s15, v16
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s19, v16
; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v12
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s19, v12
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v6
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v6
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s15, v12
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s19, v12
; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s15, v23, s1
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s19, v23, s1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s14
+; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s18
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1
; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo
@@ -1759,8 +1758,8 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1
; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1
; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1
-; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[4:5]
-; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[6:7]
+; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[8:9]
+; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[10:11]
; GFX10-NEXT: s_endpgm
%div = udiv <2 x i64> %x, %y
store <2 x i64> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
index 037210a..a2439e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
@@ -6,36 +6,36 @@
define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 {
; GFX8-LABEL: constant_load_i8_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_load_i8_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_byte v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i8_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_byte v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%ld = load i8, ptr addrspace(4) %in, align 4
store i8 %ld, ptr addrspace(1) %out, align 4
@@ -45,36 +45,36 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a
define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 {
; GFX8-LABEL: constant_load_i16_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_load_i16_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_short v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i16_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_short v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%ld = load i16, ptr addrspace(4) %in, align 4
store i16 %ld, ptr addrspace(1) %out, align 4
@@ -84,39 +84,39 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr
define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: sextload_i8_to_i32_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sext_i32_i8 s2, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_sext_i32_i8 s0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sextload_i8_to_i32_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i8 s2, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_sext_i32_i8 s0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sextload_i8_to_i32_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sext_i32_i8 s2, s2
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_sext_i32_i8 s0, s0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 4
%sext = sext i8 %load to i32
@@ -127,39 +127,39 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: sextload_i16_to_i32_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sextload_i16_to_i32_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sextload_i16_to_i32_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_sext_i32_i16 s0, s0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 4
%sext = sext i16 %load to i32
@@ -170,39 +170,39 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: zextload_i8_to_i32_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s2, s2, 0xff
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_and_b32 s0, s0, 0xff
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: zextload_i8_to_i32_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s2, s2, 0xff
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_and_b32 s0, s0, 0xff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zextload_i8_to_i32_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_and_b32 s2, s2, 0xff
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 4
%zext = zext i8 %load to i32
@@ -213,39 +213,39 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: zextload_i16_to_i32_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: zextload_i16_to_i32_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zextload_i16_to_i32_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 4
%zext = zext i16 %load to i32
@@ -256,35 +256,35 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: constant_load_i8_align2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_load_i8_align2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX9-NEXT: global_store_byte v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i8_align2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
store i8 %load, ptr addrspace(1) %out, align 2
@@ -294,35 +294,35 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad
define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: constant_load_i16_align2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_load_i16_align2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i16_align2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 2
store i16 %load, ptr addrspace(1) %out, align 2
@@ -332,43 +332,43 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a
define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: constant_sextload_i8_align2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_add_u32 s2, s0, 2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s0, s4, 2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_sextload_i8_align2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3]
+; GFX9-NEXT: global_load_sbyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
-; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
+; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_sextload_i8_align2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3]
+; GFX10-NEXT: global_load_sbyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
+; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
%sextload = sext i8 %load to i32
@@ -379,43 +379,43 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: constant_zextload_i8_align2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_add_u32 s2, s0, 2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s0, s4, 2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_zextload_i8_align2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
-; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
+; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_zextload_i8_align2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
+; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
%zextload = zext i8 %load to i32
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 422e274..cdf03ae 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -22,65 +22,65 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; GFX8-LABEL: s_add_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_i32 s0, s2, s3
+; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_add_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s2, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_add_i32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s2, s2, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_add_i32 s0, s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s2, s3
+; GFX11-NEXT: s_add_i32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_add_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_co_i32 s2, s2, s3
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -110,75 +110,75 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX8-LABEL: s_add_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_i32 s0, s5, s7
-; GFX8-NEXT: s_add_i32 s1, s4, s6
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_add_i32 s1, s1, s3
+; GFX8-NEXT: s_add_i32 s0, s0, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_add_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s5, s7
-; GFX9-NEXT: s_add_i32 s3, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_add_i32 s1, s1, s3
+; GFX9-NEXT: s_add_i32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v2i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s2, s4, s6
-; GFX10-NEXT: s_add_i32 s3, s5, s7
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_add_i32 s0, s0, s2
+; GFX10-NEXT: s_add_i32 s1, s1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v2i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s4, s6
-; GFX11-NEXT: s_add_i32 s3, s5, s7
+; GFX11-NEXT: s_add_i32 s0, s0, s2
+; GFX11-NEXT: s_add_i32 s1, s1, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_add_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_co_i32 s2, s4, s6
-; GFX12-NEXT: s_add_co_i32 s3, s5, s7
+; GFX12-NEXT: s_add_co_i32 s0, s0, s2
+; GFX12-NEXT: s_add_co_i32 s1, s1, s3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -342,42 +342,42 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX8-LABEL: s_add_v8i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_i32 s7, s7, s15
; GFX8-NEXT: s_add_i32 s6, s6, s14
; GFX8-NEXT: s_add_i32 s5, s5, s13
; GFX8-NEXT: s_add_i32 s4, s4, s12
-; GFX8-NEXT: s_add_i32 s2, s11, s19
-; GFX8-NEXT: s_add_i32 s3, s10, s18
+; GFX8-NEXT: s_add_i32 s0, s11, s19
+; GFX8-NEXT: s_add_i32 s1, s10, s18
; GFX8-NEXT: s_add_i32 s9, s9, s17
; GFX8-NEXT: s_add_i32 s8, s8, s16
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_add_u32 s0, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_add_v8i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s7, s15
-; GFX9-NEXT: s_add_i32 s3, s6, s14
+; GFX9-NEXT: s_add_i32 s0, s7, s15
+; GFX9-NEXT: s_add_i32 s1, s6, s14
; GFX9-NEXT: s_add_i32 s6, s11, s19
; GFX9-NEXT: s_add_i32 s7, s10, s18
; GFX9-NEXT: s_add_i32 s9, s9, s17
@@ -388,23 +388,24 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v8i32:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v8, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s2, s7, s15
-; GFX10-NEXT: s_add_i32 s3, s6, s14
+; GFX10-NEXT: s_add_i32 s0, s7, s15
+; GFX10-NEXT: s_add_i32 s1, s6, s14
; GFX10-NEXT: s_add_i32 s6, s11, s19
; GFX10-NEXT: s_add_i32 s7, s10, s18
; GFX10-NEXT: s_add_i32 s8, s8, s16
@@ -417,20 +418,20 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX10-NEXT: v_mov_b32_e32 v3, s6
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_mov_b32_e32 v5, s5
-; GFX10-NEXT: v_mov_b32_e32 v6, s3
-; GFX10-NEXT: v_mov_b32_e32 v7, s2
-; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v6, s1
+; GFX10-NEXT: v_mov_b32_e32 v7, s0
+; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] offset:16
+; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v8i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s7, s15
-; GFX11-NEXT: s_add_i32 s3, s6, s14
+; GFX11-NEXT: s_add_i32 s0, s7, s15
+; GFX11-NEXT: s_add_i32 s1, s6, s14
; GFX11-NEXT: s_add_i32 s6, s11, s19
; GFX11-NEXT: s_add_i32 s7, s10, s18
; GFX11-NEXT: s_add_i32 s8, s8, s16
@@ -440,11 +441,11 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6
; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX11-NEXT: v_mov_b32_e32 v6, s3
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_mov_b32_e32 v6, s1
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -453,10 +454,10 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_co_i32 s2, s7, s15
-; GFX12-NEXT: s_add_co_i32 s3, s6, s14
+; GFX12-NEXT: s_add_co_i32 s0, s7, s15
+; GFX12-NEXT: s_add_co_i32 s1, s6, s14
; GFX12-NEXT: s_add_co_i32 s6, s11, s19
; GFX12-NEXT: s_add_co_i32 s7, s10, s18
; GFX12-NEXT: s_add_co_i32 s8, s8, s16
@@ -466,11 +467,11 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6
; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX12-NEXT: v_mov_b32_e32 v6, s3
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[2:3] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -534,7 +535,7 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
; GFX8-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_i32 s7, s7, s39
; GFX8-NEXT: s_add_i32 s6, s6, s38
@@ -548,43 +549,43 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX8-NEXT: s_add_i32 s14, s14, s46
; GFX8-NEXT: s_add_i32 s13, s13, s45
; GFX8-NEXT: s_add_i32 s12, s12, s44
-; GFX8-NEXT: s_add_i32 s2, s19, s51
-; GFX8-NEXT: s_add_i32 s3, s18, s50
+; GFX8-NEXT: s_add_i32 s0, s19, s51
+; GFX8-NEXT: s_add_i32 s1, s18, s50
; GFX8-NEXT: s_add_i32 s17, s17, s49
; GFX8-NEXT: s_add_i32 s16, s16, s48
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_add_u32 s0, s2, 48
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s2, 32
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v1, s17
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s2, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NEXT: v_mov_b32_e32 v3, s15
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -592,11 +593,11 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
; GFX9-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s7, s39
-; GFX9-NEXT: s_add_i32 s3, s6, s38
+; GFX9-NEXT: s_add_i32 s0, s7, s39
+; GFX9-NEXT: s_add_i32 s1, s6, s38
; GFX9-NEXT: s_add_i32 s6, s11, s43
; GFX9-NEXT: s_add_i32 s7, s10, s42
; GFX9-NEXT: s_add_i32 s10, s15, s47
@@ -613,38 +614,38 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX9-NEXT: v_mov_b32_e32 v3, s14
; GFX9-NEXT: s_add_i32 s9, s9, s41
; GFX9-NEXT: s_add_i32 s8, s8, s40
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48
; GFX9-NEXT: s_add_i32 s5, s5, s37
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s11
; GFX9-NEXT: v_mov_b32_e32 v3, s10
; GFX9-NEXT: s_add_i32 s4, s4, s36
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v16i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
; GFX10-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v16, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s2, s7, s39
-; GFX10-NEXT: s_add_i32 s3, s6, s38
+; GFX10-NEXT: s_add_i32 s0, s7, s39
+; GFX10-NEXT: s_add_i32 s1, s6, s38
; GFX10-NEXT: s_add_i32 s6, s11, s43
; GFX10-NEXT: s_add_i32 s7, s10, s42
; GFX10-NEXT: s_add_i32 s10, s15, s47
@@ -673,12 +674,12 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX10-NEXT: v_mov_b32_e32 v11, s6
; GFX10-NEXT: v_mov_b32_e32 v12, s4
; GFX10-NEXT: v_mov_b32_e32 v13, s5
-; GFX10-NEXT: v_mov_b32_e32 v14, s3
-; GFX10-NEXT: v_mov_b32_e32 v15, s2
-; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48
-; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:32
-; GFX10-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16
-; GFX10-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v14, s1
+; GFX10-NEXT: v_mov_b32_e32 v15, s0
+; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:48
+; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:32
+; GFX10-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
+; GFX10-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v16i32:
@@ -686,10 +687,10 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s7, s39
-; GFX11-NEXT: s_add_i32 s3, s6, s38
+; GFX11-NEXT: s_add_i32 s0, s7, s39
+; GFX11-NEXT: s_add_i32 s1, s6, s38
; GFX11-NEXT: s_add_i32 s6, s11, s43
; GFX11-NEXT: s_add_i32 s7, s10, s42
; GFX11-NEXT: s_add_i32 s10, s15, s47
@@ -711,13 +712,13 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX11-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s9
; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s6
; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5
-; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX11-NEXT: v_mov_b32_e32 v14, s3
+; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s0
+; GFX11-NEXT: v_mov_b32_e32 v14, s1
; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX11-NEXT: global_store_b128 v16, v[0:3], s[2:3] offset:48
+; GFX11-NEXT: global_store_b128 v16, v[4:7], s[2:3] offset:32
+; GFX11-NEXT: global_store_b128 v16, v[8:11], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v16, v[12:15], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -727,10 +728,10 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
; GFX12-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_co_i32 s2, s7, s39
-; GFX12-NEXT: s_add_co_i32 s3, s6, s38
+; GFX12-NEXT: s_add_co_i32 s0, s7, s39
+; GFX12-NEXT: s_add_co_i32 s1, s6, s38
; GFX12-NEXT: s_add_co_i32 s6, s11, s43
; GFX12-NEXT: s_add_co_i32 s7, s10, s42
; GFX12-NEXT: s_add_co_i32 s10, s15, s47
@@ -752,13 +753,13 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX12-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s9
; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s6
; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[2:3] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[2:3] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[2:3] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -792,11 +793,11 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; GFX8-LABEL: v_add_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -804,68 +805,68 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_dword v2, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_add_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_add_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_add_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_add_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v0, v1, v0
-; GFX12-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -901,66 +902,66 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX8-LABEL: v_add_imm_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7b, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_add_imm_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, 0x7b, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_add_imm_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_add_imm_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_add_imm_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: global_load_b32 v0, v0, s[6:7] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -993,11 +994,11 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX8-LABEL: add64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_add_u32 s0, s6, s0
-; GFX8-NEXT: s_addc_u32 s1, s7, s1
+; GFX8-NEXT: s_add_u32 s0, s6, s2
+; GFX8-NEXT: s_addc_u32 s1, s7, s3
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -1035,10 +1036,10 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s6, s0
-; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -1050,9 +1051,9 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -1090,15 +1091,15 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
;
; GFX8-LABEL: add64_sgpr_vgpr:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_u32 s0, s2, s4
-; GFX8-NEXT: s_addc_u32 s1, s3, s5
+; GFX8-NEXT: s_add_u32 s0, s6, s0
+; GFX8-NEXT: s_addc_u32 s1, s7, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -1138,16 +1139,16 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
; GFX11-LABEL: add64_sgpr_vgpr:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s2, s2, s4
-; GFX11-NEXT: s_addc_u32 s3, s3, s5
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s6, s0
+; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1155,15 +1156,15 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
; GFX12-LABEL: add64_sgpr_vgpr:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1205,123 +1206,123 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
;
; GFX8-LABEL: add64_in_branch:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX8-NEXT: s_cbranch_scc0 .LBB9_4
; GFX8-NEXT: ; %bb.1: ; %else
-; GFX8-NEXT: s_add_u32 s4, s4, s6
-; GFX8-NEXT: s_addc_u32 s5, s5, s7
-; GFX8-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; GFX8-NEXT: s_add_u32 s0, s8, s10
+; GFX8-NEXT: s_addc_u32 s1, s9, s11
+; GFX8-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX8-NEXT: s_cbranch_vccnz .LBB9_3
; GFX8-NEXT: .LBB9_2: ; %if
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NEXT: .LBB9_3: ; %endif
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
; GFX8-NEXT: .LBB9_4:
-; GFX8-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX8-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX8-NEXT: s_branch .LBB9_2
;
; GFX9-LABEL: add64_in_branch:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_cbranch_scc0 .LBB9_4
; GFX9-NEXT: ; %bb.1: ; %else
-; GFX9-NEXT: s_add_u32 s4, s4, s6
-; GFX9-NEXT: s_addc_u32 s5, s5, s7
-; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; GFX9-NEXT: s_add_u32 s0, s8, s10
+; GFX9-NEXT: s_addc_u32 s1, s9, s11
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccnz .LBB9_3
; GFX9-NEXT: .LBB9_2: ; %if
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: .LBB9_3: ; %endif
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB9_4:
-; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX9-NEXT: s_branch .LBB9_2
;
; GFX10-LABEL: add64_in_branch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX10-NEXT: s_cbranch_scc0 .LBB9_4
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: s_add_u32 s4, s4, s6
-; GFX10-NEXT: s_addc_u32 s5, s5, s7
+; GFX10-NEXT: s_add_u32 s0, s8, s10
+; GFX10-NEXT: s_addc_u32 s1, s9, s11
; GFX10-NEXT: s_cbranch_execnz .LBB9_3
; GFX10-NEXT: .LBB9_2: ; %if
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX10-NEXT: .LBB9_3: ; %endif
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: .LBB9_4:
-; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX10-NEXT: s_branch .LBB9_2
;
; GFX11-LABEL: add64_in_branch:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX11-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX11-NEXT: s_cbranch_scc0 .LBB9_4
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: s_add_u32 s0, s8, s10
+; GFX11-NEXT: s_addc_u32 s1, s9, s11
; GFX11-NEXT: s_cbranch_execnz .LBB9_3
; GFX11-NEXT: .LBB9_2: ; %if
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX11-NEXT: .LBB9_3: ; %endif
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB9_4:
-; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX11-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX11-NEXT: s_branch .LBB9_2
;
; GFX12-LABEL: add64_in_branch:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX12-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX12-NEXT: s_cbranch_scc0 .LBB9_4
; GFX12-NEXT: ; %bb.1: ; %else
-; GFX12-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[8:9], s[10:11]
; GFX12-NEXT: s_cbranch_execnz .LBB9_3
; GFX12-NEXT: .LBB9_2: ; %if
-; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: .LBB9_3: ; %endif
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB9_4:
-; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX12-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX12-NEXT: s_branch .LBB9_2
entry:
%0 = icmp eq i64 %a, 0
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index be9b5b0..65b8db9 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -10,14 +10,14 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: v_test_add_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -66,13 +66,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
@@ -94,19 +94,19 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: s_test_add_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: s_lshr_b32 s3, s0, 16
-; VI-NEXT: s_add_i32 s2, s2, s0
-; VI-NEXT: s_add_i32 s1, s1, s3
-; VI-NEXT: s_and_b32 s0, s2, 0xffff
-; VI-NEXT: s_lshl_b32 s1, s1, 16
+; VI-NEXT: s_lshr_b32 s2, s0, 16
+; VI-NEXT: s_lshr_b32 s3, s1, 16
+; VI-NEXT: s_add_i32 s0, s0, s1
+; VI-NEXT: s_add_i32 s2, s2, s3
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_lshl_b32 s1, s2, 16
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -144,13 +144,13 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_u16 v1, s2, s0
+; GFX11-NEXT: v_pk_add_u16 v1, s0, s1
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -165,54 +165,54 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
; VI-LABEL: s_test_add_self_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: s_and_b32 s1, s2, 0xffff
-; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_add_i32 s0, s0, s0
-; VI-NEXT: s_lshl_b32 s0, s0, 16
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_lshl_b32 s1, s1, 16
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_add_self_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v1, s2, s2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: v_pk_add_u16 v1, s0, s0
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_add_self_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_add_u16 v1, s2, s2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: v_pk_add_u16 v1, s0, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_add_self_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_u16 v1, s2, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_pk_add_u16 v1, s0, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -279,17 +279,17 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0x1c8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u16_e32 v4, 0x7b, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
@@ -298,38 +298,38 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_add_v2i16_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s2, 0x1c8007b
-; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_constant:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -346,17 +346,17 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_neg_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u16_e32 v4, 0xfcb3, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
@@ -365,38 +365,38 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_add_v2i16_neg_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s2, 0xfc21fcb3
-; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_neg_constant:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_neg_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -412,17 +412,17 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out,
define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_inline_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u16_e32 v4, -1, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
@@ -431,37 +431,37 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, -1
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, -1
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, -1
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -477,16 +477,16 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p
define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; VI-NEXT: v_add_u16_e32 v2, 32, v2
; VI-NEXT: v_or_b32_e32 v2, v2, v3
@@ -495,37 +495,37 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 32
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 32
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, 32
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -542,17 +542,17 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_inline_fp_split:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0x3f80
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -560,37 +560,37 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, 1.0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -608,14 +608,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
; VI-LABEL: v_test_add_v2i16_zext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -667,13 +667,13 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -700,14 +700,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; VI-LABEL: v_test_add_v2i16_zext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -763,12 +763,12 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
@@ -796,14 +796,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
; VI-LABEL: v_test_add_v2i16_sext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -857,13 +857,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -890,14 +890,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
; VI-LABEL: v_test_add_v2i16_sext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
@@ -957,13 +957,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
index 330cf48..46379da 100644
--- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
+++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
@@ -46,11 +46,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX9-NEXT: s_cmp_lt_i32 s2, 1
; GFX9-NEXT: s_cbranch_scc0 .LBB2_2
; GFX9-NEXT: ; %bb.1: ; %else
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB2_2: ; %then
@@ -63,11 +63,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX10-NEXT: s_cmp_lt_i32 s2, 1
; GFX10-NEXT: s_cbranch_scc0 .LBB2_2
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: .LBB2_2: ; %then
@@ -80,10 +80,10 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX11-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-NEXT: s_cbranch_scc0 .LBB2_2
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB2_2: ; %then
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index 77976e4..95f5947 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -72,12 +72,12 @@ define amdgpu_ps void @test_sgpr_plus_imm_offset(ptr addrspace(4) inreg %base, i
; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr2
; SDAG-DAG: %[[BASE:.*]]:sgpr_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
-; SDAG: S_LOAD_DWORDX2_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 16,
+; SDAG: S_LOAD_DWORDX2_SGPR_IMM_ec killed %[[BASE]], %[[OFFSET]], 16,
; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr2
; GISEL-DAG: %[[BASE:.*]]:sreg_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
-; GISEL: S_LOAD_DWORDX2_SGPR_IMM %[[BASE]], %[[OFFSET]], 16,
+; GISEL: S_LOAD_DWORDX2_SGPR_IMM_ec %[[BASE]], %[[OFFSET]], 16,
define amdgpu_ps void @test_sgpr_plus_imm_offset_x2(ptr addrspace(4) inreg %base, i32 inreg %offset,
ptr addrspace(1) inreg %out) {
%v1 = getelementptr i8, ptr addrspace(4) %base, i64 16
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 559871d..e45acee 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -475,12 +475,12 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s3, s2, 16
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
@@ -488,7 +488,8 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
-; GFX9-NEXT: global_store_short v3, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v3, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv i16 %x, %y
store i16 %r, ptr addrspace(1) %out
@@ -544,13 +545,13 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
;
; GFX9-LABEL: urem_i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s3, s2, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_lshr_b32 s5, s4, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
@@ -559,10 +560,10 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem i16 %x, %y
store i16 %r, ptr addrspace(1) %out
@@ -709,29 +710,28 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-LABEL: srem_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s5, s4, 16
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5
-; GFX9-NEXT: s_sext_i32_i16 s2, s4
-; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2
-; GFX9-NEXT: s_xor_b32 s2, s2, s5
+; GFX9-NEXT: s_sext_i32_i16 s0, s4
+; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX9-NEXT: s_xor_b32 s0, s0, s5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: s_or_b32 s6, s2, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_or_b32 s6, s0, 1
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s6, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v2
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s6, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v2
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = srem i16 %x, %y
store i16 %r, ptr addrspace(1) %out
@@ -781,20 +781,20 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
;
; GFX9-LABEL: udiv_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s4
; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1
; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
-; GFX9-NEXT: global_store_byte v2, v0, s[0:1]
+; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv i8 %x, %y
store i8 %r, ptr addrspace(1) %out
@@ -849,13 +849,13 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
;
; GFX9-LABEL: urem_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2
-; GFX9-NEXT: s_lshr_b32 s3, s2, 8
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4
+; GFX9-NEXT: s_lshr_b32 s0, s4, 8
; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1
@@ -863,10 +863,9 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem i8 %x, %y
store i8 %r, ptr addrspace(1) %out
@@ -1277,12 +1276,12 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-LABEL: udiv_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT: s_sub_i32 s2, 0, s8
+; GFX9-NEXT: s_sub_i32 s0, 0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
@@ -1290,40 +1289,40 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s2, s2, s3
-; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2
-; GFX9-NEXT: s_add_i32 s3, s3, s2
-; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3
-; GFX9-NEXT: s_mul_i32 s3, s2, s8
-; GFX9-NEXT: s_sub_i32 s3, s4, s3
-; GFX9-NEXT: s_add_i32 s13, s2, 1
-; GFX9-NEXT: s_sub_i32 s4, s3, s8
-; GFX9-NEXT: s_cmp_ge_u32 s3, s8
-; GFX9-NEXT: s_cselect_b32 s2, s13, s2
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_add_i32 s4, s2, 1
-; GFX9-NEXT: s_cmp_ge_u32 s3, s8
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_mul_i32 s0, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s1
+; GFX9-NEXT: s_mul_i32 s1, s0, s8
+; GFX9-NEXT: s_sub_i32 s1, s4, s1
+; GFX9-NEXT: s_add_i32 s13, s0, 1
+; GFX9-NEXT: s_sub_i32 s4, s1, s8
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
+; GFX9-NEXT: s_cselect_b32 s0, s13, s0
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_add_i32 s4, s0, 1
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: s_cselect_b32 s2, s4, s2
-; GFX9-NEXT: s_sub_i32 s3, 0, s9
-; GFX9-NEXT: s_mul_i32 s3, s3, s12
-; GFX9-NEXT: s_mul_hi_u32 s3, s12, s3
-; GFX9-NEXT: s_add_i32 s12, s12, s3
+; GFX9-NEXT: s_cselect_b32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s1, 0, s9
+; GFX9-NEXT: s_mul_i32 s1, s1, s12
+; GFX9-NEXT: s_mul_hi_u32 s1, s12, s1
+; GFX9-NEXT: s_add_i32 s12, s12, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v2
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s12
-; GFX9-NEXT: s_mul_i32 s4, s3, s9
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s12
+; GFX9-NEXT: s_mul_i32 s4, s1, s9
; GFX9-NEXT: s_sub_i32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s8, s3, 1
+; GFX9-NEXT: s_add_i32 s8, s1, 1
; GFX9-NEXT: s_sub_i32 s5, s4, s9
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: s_cmp_ge_u32 s4, s9
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_cselect_b32 s3, s8, s3
+; GFX9-NEXT: s_cselect_b32 s1, s8, s1
; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s5, s3, 1
+; GFX9-NEXT: s_add_i32 s5, s1, 1
; GFX9-NEXT: s_cmp_ge_u32 s4, s9
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
; GFX9-NEXT: v_readfirstlane_b32 s5, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11
; GFX9-NEXT: s_sub_i32 s4, 0, s10
@@ -1360,11 +1359,11 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: s_add_i32 s7, s5, 1
; GFX9-NEXT: s_cmp_ge_u32 s6, s11
; GFX9-NEXT: s_cselect_b32 s5, s7, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv <4 x i32> %x, %y
store <4 x i32> %r, ptr addrspace(1) %out
@@ -1585,12 +1584,12 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-LABEL: urem_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT: s_sub_i32 s2, 0, s8
+; GFX9-NEXT: s_sub_i32 s0, 0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
@@ -1600,35 +1599,35 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s2, s2, s3
-; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2
-; GFX9-NEXT: s_add_i32 s3, s3, s2
-; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3
-; GFX9-NEXT: s_mul_i32 s2, s2, s8
-; GFX9-NEXT: s_sub_i32 s2, s4, s2
-; GFX9-NEXT: s_sub_i32 s3, s2, s8
-; GFX9-NEXT: s_cmp_ge_u32 s2, s8
-; GFX9-NEXT: s_cselect_b32 s2, s3, s2
-; GFX9-NEXT: s_sub_i32 s3, s2, s8
-; GFX9-NEXT: s_cmp_ge_u32 s2, s8
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_mul_i32 s0, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s1
+; GFX9-NEXT: s_mul_i32 s0, s0, s8
+; GFX9-NEXT: s_sub_i32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s1, s0, s8
+; GFX9-NEXT: s_cmp_ge_u32 s0, s8
+; GFX9-NEXT: s_cselect_b32 s0, s1, s0
+; GFX9-NEXT: s_sub_i32 s1, s0, s8
+; GFX9-NEXT: s_cmp_ge_u32 s0, s8
; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: s_cselect_b32 s2, s3, s2
-; GFX9-NEXT: s_sub_i32 s3, 0, s9
-; GFX9-NEXT: s_mul_i32 s3, s3, s12
-; GFX9-NEXT: s_mul_hi_u32 s3, s12, s3
-; GFX9-NEXT: s_add_i32 s12, s12, s3
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s12
-; GFX9-NEXT: s_mul_i32 s3, s3, s9
-; GFX9-NEXT: s_sub_i32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s4, s3, s9
+; GFX9-NEXT: s_cselect_b32 s0, s1, s0
+; GFX9-NEXT: s_sub_i32 s1, 0, s9
+; GFX9-NEXT: s_mul_i32 s1, s1, s12
+; GFX9-NEXT: s_mul_hi_u32 s1, s12, s1
+; GFX9-NEXT: s_add_i32 s12, s12, s1
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s12
+; GFX9-NEXT: s_mul_i32 s1, s1, s9
+; GFX9-NEXT: s_sub_i32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s9
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_cmp_ge_u32 s3, s9
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_sub_i32 s4, s3, s9
-; GFX9-NEXT: s_cmp_ge_u32 s3, s9
+; GFX9-NEXT: s_cmp_ge_u32 s1, s9
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s9
+; GFX9-NEXT: s_cmp_ge_u32 s1, s9
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
; GFX9-NEXT: s_sub_i32 s4, 0, s10
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_mul_i32 s4, s4, s5
@@ -1660,11 +1659,11 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: s_sub_i32 s6, s5, s11
; GFX9-NEXT: s_cmp_ge_u32 s5, s11
; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem <4 x i32> %x, %y
store <4 x i32> %r, ptr addrspace(1) %out
@@ -1966,7 +1965,6 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_abs_i32 s2, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
@@ -1998,85 +1996,87 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: s_xor_b32 s8, s5, s9
; GFX9-NEXT: s_sub_i32 s9, 0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: s_sub_i32 s12, s2, s3
; GFX9-NEXT: s_abs_i32 s5, s5
; GFX9-NEXT: s_ashr_i32 s8, s8, 31
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s9, s9, s3
-; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9
-; GFX9-NEXT: s_add_i32 s3, s3, s9
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3
-; GFX9-NEXT: s_mul_i32 s9, s3, s4
-; GFX9-NEXT: s_sub_i32 s5, s5, s9
-; GFX9-NEXT: s_add_i32 s12, s3, 1
-; GFX9-NEXT: s_sub_i32 s9, s5, s4
-; GFX9-NEXT: s_cmp_ge_u32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s3, s12, s3
-; GFX9-NEXT: s_cselect_b32 s5, s9, s5
-; GFX9-NEXT: s_add_i32 s9, s3, 1
-; GFX9-NEXT: s_cmp_ge_u32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s3, s9, s3
-; GFX9-NEXT: s_abs_i32 s4, s10
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s8
-; GFX9-NEXT: s_sub_i32 s9, 0, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, s8
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s9, s9, s2
+; GFX9-NEXT: s_mul_hi_u32 s3, s2, s9
+; GFX9-NEXT: s_add_i32 s2, s2, s3
+; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
+; GFX9-NEXT: s_mul_i32 s3, s2, s4
+; GFX9-NEXT: s_sub_i32 s3, s5, s3
+; GFX9-NEXT: s_add_i32 s9, s2, 1
+; GFX9-NEXT: s_sub_i32 s5, s3, s4
+; GFX9-NEXT: s_cmp_ge_u32 s3, s4
+; GFX9-NEXT: s_cselect_b32 s2, s9, s2
+; GFX9-NEXT: s_cselect_b32 s3, s5, s3
+; GFX9-NEXT: s_add_i32 s5, s2, 1
+; GFX9-NEXT: s_cmp_ge_u32 s3, s4
+; GFX9-NEXT: s_cselect_b32 s2, s5, s2
+; GFX9-NEXT: s_abs_i32 s3, s10
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT: s_xor_b32 s2, s2, s8
+; GFX9-NEXT: s_xor_b32 s4, s6, s10
+; GFX9-NEXT: s_abs_i32 s5, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s5, s6, s10
-; GFX9-NEXT: s_abs_i32 s6, s6
-; GFX9-NEXT: s_ashr_i32 s5, s5, 31
+; GFX9-NEXT: s_sub_i32 s6, 0, s3
+; GFX9-NEXT: s_sub_i32 s8, s2, s8
+; GFX9-NEXT: s_ashr_i32 s4, s4, 31
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s9, s9, s8
-; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9
-; GFX9-NEXT: s_add_i32 s8, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8
-; GFX9-NEXT: s_mul_i32 s9, s8, s4
-; GFX9-NEXT: s_sub_i32 s6, s6, s9
-; GFX9-NEXT: s_add_i32 s10, s8, 1
-; GFX9-NEXT: s_sub_i32 s9, s6, s4
-; GFX9-NEXT: s_cmp_ge_u32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s8, s10, s8
-; GFX9-NEXT: s_cselect_b32 s6, s9, s6
-; GFX9-NEXT: s_add_i32 s9, s8, 1
-; GFX9-NEXT: s_cmp_ge_u32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s4, s9, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s6, s6, s2
+; GFX9-NEXT: s_mul_hi_u32 s6, s2, s6
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
+; GFX9-NEXT: s_mul_i32 s6, s2, s3
+; GFX9-NEXT: s_sub_i32 s5, s5, s6
+; GFX9-NEXT: s_add_i32 s9, s2, 1
+; GFX9-NEXT: s_sub_i32 s6, s5, s3
+; GFX9-NEXT: s_cmp_ge_u32 s5, s3
+; GFX9-NEXT: s_cselect_b32 s2, s9, s2
+; GFX9-NEXT: s_cselect_b32 s5, s6, s5
+; GFX9-NEXT: s_add_i32 s6, s2, 1
+; GFX9-NEXT: s_cmp_ge_u32 s5, s3
+; GFX9-NEXT: s_cselect_b32 s5, s6, s2
; GFX9-NEXT: s_abs_i32 s6, s11
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6
-; GFX9-NEXT: s_xor_b32 s4, s4, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_xor_b32 s2, s7, s11
+; GFX9-NEXT: s_xor_b32 s5, s5, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_xor_b32 s0, s7, s11
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT: s_abs_i32 s3, s7
+; GFX9-NEXT: s_abs_i32 s1, s7
; GFX9-NEXT: s_sub_i32 s7, 0, s6
-; GFX9-NEXT: s_sub_i32 s4, s4, s5
+; GFX9-NEXT: s_sub_i32 s4, s5, s4
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_ashr_i32 s0, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_mul_i32 s7, s7, s5
; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7
; GFX9-NEXT: s_add_i32 s5, s5, s7
-; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5
; GFX9-NEXT: s_mul_i32 s7, s5, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s7
+; GFX9-NEXT: s_sub_i32 s1, s1, s7
; GFX9-NEXT: s_add_i32 s8, s5, 1
-; GFX9-NEXT: s_sub_i32 s7, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
+; GFX9-NEXT: s_sub_i32 s7, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
; GFX9-NEXT: s_cselect_b32 s5, s8, s5
-; GFX9-NEXT: s_cselect_b32 s3, s7, s3
+; GFX9-NEXT: s_cselect_b32 s1, s7, s1
; GFX9-NEXT: s_add_i32 s7, s5, 1
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s7, s5
-; GFX9-NEXT: s_xor_b32 s3, s3, s2
-; GFX9-NEXT: s_sub_i32 s2, s3, s2
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s7, s5
+; GFX9-NEXT: s_xor_b32 s1, s1, s0
+; GFX9-NEXT: s_sub_i32 s0, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%r = sdiv <4 x i32> %x, %y
store <4 x i32> %r, ptr addrspace(1) %out
@@ -2350,7 +2350,6 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_abs_i32 s2, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
@@ -2377,78 +2376,80 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX9-NEXT: s_xor_b32 s2, s2, s3
; GFX9-NEXT: s_sub_i32 s9, 0, s4
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: s_sub_i32 s12, s2, s3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_ashr_i32 s8, s5, 31
; GFX9-NEXT: s_abs_i32 s5, s5
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s9, s9, s3
-; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9
-; GFX9-NEXT: s_add_i32 s3, s3, s9
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3
-; GFX9-NEXT: s_mul_i32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s5, s3, s4
-; GFX9-NEXT: s_cmp_ge_u32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s5, s3, s4
-; GFX9-NEXT: s_cmp_ge_u32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_abs_i32 s4, s10
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s8
-; GFX9-NEXT: s_sub_i32 s9, 0, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, s8
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s9, s9, s2
+; GFX9-NEXT: s_mul_hi_u32 s3, s2, s9
+; GFX9-NEXT: s_add_i32 s2, s2, s3
+; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
+; GFX9-NEXT: s_mul_i32 s2, s2, s4
+; GFX9-NEXT: s_sub_i32 s2, s5, s2
+; GFX9-NEXT: s_sub_i32 s3, s2, s4
+; GFX9-NEXT: s_cmp_ge_u32 s2, s4
+; GFX9-NEXT: s_cselect_b32 s2, s3, s2
+; GFX9-NEXT: s_sub_i32 s3, s2, s4
+; GFX9-NEXT: s_cmp_ge_u32 s2, s4
+; GFX9-NEXT: s_cselect_b32 s2, s3, s2
+; GFX9-NEXT: s_abs_i32 s3, s10
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT: s_xor_b32 s2, s2, s8
+; GFX9-NEXT: s_ashr_i32 s4, s6, 31
+; GFX9-NEXT: s_abs_i32 s5, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_ashr_i32 s5, s6, 31
-; GFX9-NEXT: s_abs_i32 s6, s6
+; GFX9-NEXT: s_sub_i32 s6, 0, s3
+; GFX9-NEXT: s_sub_i32 s8, s2, s8
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s9, s9, s8
-; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9
-; GFX9-NEXT: s_add_i32 s8, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8
-; GFX9-NEXT: s_mul_i32 s8, s8, s4
-; GFX9-NEXT: s_sub_i32 s6, s6, s8
-; GFX9-NEXT: s_sub_i32 s8, s6, s4
-; GFX9-NEXT: s_cmp_ge_u32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s6, s8, s6
-; GFX9-NEXT: s_sub_i32 s8, s6, s4
-; GFX9-NEXT: s_cmp_ge_u32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s4, s8, s6
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s6, s6, s2
+; GFX9-NEXT: s_mul_hi_u32 s6, s2, s6
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
+; GFX9-NEXT: s_mul_i32 s2, s2, s3
+; GFX9-NEXT: s_sub_i32 s2, s5, s2
+; GFX9-NEXT: s_sub_i32 s5, s2, s3
+; GFX9-NEXT: s_cmp_ge_u32 s2, s3
+; GFX9-NEXT: s_cselect_b32 s2, s5, s2
+; GFX9-NEXT: s_sub_i32 s5, s2, s3
+; GFX9-NEXT: s_cmp_ge_u32 s2, s3
+; GFX9-NEXT: s_cselect_b32 s5, s5, s2
; GFX9-NEXT: s_abs_i32 s6, s11
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT: s_xor_b32 s4, s4, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_ashr_i32 s2, s7, 31
+; GFX9-NEXT: s_xor_b32 s5, s5, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_abs_i32 s3, s7
+; GFX9-NEXT: s_abs_i32 s1, s7
; GFX9-NEXT: s_sub_i32 s7, 0, s6
+; GFX9-NEXT: s_sub_i32 s4, s5, s4
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_sub_i32 s4, s4, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_mul_i32 s7, s7, s5
; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7
; GFX9-NEXT: s_add_i32 s5, s5, s7
-; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5
; GFX9-NEXT: s_mul_i32 s5, s5, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s5
-; GFX9-NEXT: s_sub_i32 s5, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s5, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_xor_b32 s3, s3, s2
-; GFX9-NEXT: s_sub_i32 s2, s3, s2
+; GFX9-NEXT: s_sub_i32 s1, s1, s5
+; GFX9-NEXT: s_sub_i32 s5, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s5, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_xor_b32 s1, s1, s0
+; GFX9-NEXT: s_sub_i32 s0, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%r = srem <4 x i32> %x, %y
store <4 x i32> %r, ptr addrspace(1) %out
@@ -2604,7 +2605,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s3, s6, 0xffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
@@ -2617,28 +2617,29 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
-; GFX9-NEXT: s_and_b32 s2, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s7, 0xffff
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT: s_and_b32 s2, s5, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s5, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
; GFX9-NEXT: v_trunc_f32_e32 v2, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX9-NEXT: s_lshr_b32 s2, s7, 16
+; GFX9-NEXT: s_lshr_b32 s0, s7, 16
; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_lshr_b32 s2, s5, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2
+; GFX9-NEXT: s_lshr_b32 s0, s5, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -2654,7 +2655,8 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv <4 x i16> %x, %y
store <4 x i16> %r, ptr addrspace(1) %out
@@ -2825,34 +2827,33 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-LABEL: urem_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s3, s6, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2
+; GFX9-NEXT: s_and_b32 s9, s6, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
+; GFX9-NEXT: s_and_b32 s8, s4, 0xffff
; GFX9-NEXT: s_lshr_b32 s6, s6, 16
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
; GFX9-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
+; GFX9-NEXT: s_and_b32 s0, s7, 0xffff
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT: s_and_b32 s3, s7, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v5
-; GFX9-NEXT: s_and_b32 s8, s5, 0xffff
+; GFX9-NEXT: s_and_b32 s1, s5, 0xffff
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
@@ -2867,24 +2868,25 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3
; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6
+; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0
; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1
-; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: v_sub_u32_e32 v1, s1, v2
; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem <4 x i16> %x, %y
store <4 x i16> %r, ptr addrspace(1) %out
@@ -3563,27 +3565,27 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
;
; GFX9-LABEL: urem_i3:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX9-NEXT: s_bfe_u32 s2, s4, 0x30008
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
-; GFX9-NEXT: s_and_b32 s4, s2, 7
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4
-; GFX9-NEXT: s_lshr_b32 s3, s2, 8
+; GFX9-NEXT: s_and_b32 s3, s4, 7
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s3
+; GFX9-NEXT: s_lshr_b32 s2, s4, 8
; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1
; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem i3 %x, %y
store i3 %r, ptr addrspace(1) %out
@@ -3753,12 +3755,12 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
; GFX9-NEXT: s_cselect_b32 s2, s6, 0
; GFX9-NEXT: v_add_u32_e32 v0, s2, v2
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = srem i3 %x, %y
store i3 %r, ptr addrspace(1) %out
@@ -3881,7 +3883,6 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s3, s6, 0xffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
@@ -3894,19 +3895,20 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
-; GFX9-NEXT: s_and_b32 s2, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s7, 0xffff
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v5
-; GFX9-NEXT: s_and_b32 s2, s5, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s5, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
@@ -3918,8 +3920,9 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v6, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v6, v2, s[2:3] offset:4
+; GFX9-NEXT: global_store_dword v6, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -4053,32 +4056,32 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-LABEL: urem_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s3, s6, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2
+; GFX9-NEXT: s_and_b32 s9, s6, 0xffff
; GFX9-NEXT: s_lshr_b32 s6, s6, 16
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s8, s4, 0xffff
; GFX9-NEXT: s_lshr_b32 s4, s4, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT: s_and_b32 s0, s7, 0xffff
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4
-; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT: s_and_b32 s3, s7, 0xffff
+; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3
-; GFX9-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0
+; GFX9-NEXT: s_and_b32 s1, s5, 0xffff
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
@@ -4087,19 +4090,18 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s0
+; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1
-; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2
+; GFX9-NEXT: v_sub_u32_e32 v2, s1, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4
+; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -4465,58 +4467,58 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-LABEL: srem_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i16 s8, s6
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8
; GFX9-NEXT: s_sext_i32_i16 s9, s4
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9
-; GFX9-NEXT: s_xor_b32 s2, s9, s8
+; GFX9-NEXT: s_xor_b32 s0, s9, s8
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: s_or_b32 s10, s2, 1
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_or_b32 s10, s0, 1
; GFX9-NEXT: s_sext_i32_i16 s7, s7
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s10, 0
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s10, 0
; GFX9-NEXT: s_ashr_i32 s6, s6, 16
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6
; GFX9-NEXT: s_ashr_i32 s4, s4, 16
; GFX9-NEXT: s_sext_i32_i16 s5, s5
-; GFX9-NEXT: v_add_u32_e32 v1, s2, v2
+; GFX9-NEXT: v_add_u32_e32 v1, s0, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
-; GFX9-NEXT: s_xor_b32 s2, s4, s6
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
+; GFX9-NEXT: s_xor_b32 s0, s4, s6
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8
; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
-; GFX9-NEXT: s_or_b32 s8, s2, 1
+; GFX9-NEXT: s_or_b32 s8, s0, 1
; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s7
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s8, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v3
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s8, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v3
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2
-; GFX9-NEXT: s_xor_b32 s2, s5, s7
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
+; GFX9-NEXT: s_xor_b32 s0, s5, s7
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6
; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT: s_or_b32 s6, s2, 1
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s6, 0
-; GFX9-NEXT: v_add_u32_e32 v2, s2, v4
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b32 s6, s0, 1
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s6, 0
+; GFX9-NEXT: v_add_u32_e32 v2, s0, v4
; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7
; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1
; GFX9-NEXT: v_mov_b32_e32 v3, 0
@@ -4524,9 +4526,8 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4
+; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = srem <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -4854,28 +4855,28 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
; GFX9-LABEL: urem_v3i15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30
-; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff
+; GFX9-NEXT: s_and_b32 s7, s2, 0x7fff
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2
-; GFX9-NEXT: s_bfe_u32 s2, s0, 0xf000f
+; GFX9-NEXT: s_and_b32 s0, s6, 0x7fff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30
+; GFX9-NEXT: v_alignbit_b32 v3, s3, v3, 30
+; GFX9-NEXT: s_bfe_u32 s3, s2, 0xf000f
+; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3
; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4
; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f
+; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1
-; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3
@@ -4892,11 +4893,11 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8
-; GFX9-NEXT: s_lshr_b32 s1, s0, 15
+; GFX9-NEXT: s_lshr_b32 s0, s2, 15
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5
-; GFX9-NEXT: v_mul_lo_u32 v4, v4, s1
+; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0
+; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2
; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3
; GFX9-NEXT: s_lshr_b32 s0, s6, 15
; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4
@@ -5717,54 +5718,54 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT: s_sub_i32 s6, 0, s3
+; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
+; GFX9-NEXT: s_sub_i32 s0, 0, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s7, v0
-; GFX9-NEXT: s_mul_i32 s6, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7
-; GFX9-NEXT: s_mul_i32 s7, s6, s3
-; GFX9-NEXT: s_sub_i32 s4, s4, s7
-; GFX9-NEXT: s_add_i32 s9, s6, 1
-; GFX9-NEXT: s_sub_i32 s7, s4, s3
-; GFX9-NEXT: s_cmp_ge_u32 s4, s3
-; GFX9-NEXT: s_cselect_b32 s6, s9, s6
-; GFX9-NEXT: s_cselect_b32 s4, s7, s4
-; GFX9-NEXT: s_add_i32 s7, s6, 1
-; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_mul_i32 s0, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s1
+; GFX9-NEXT: s_mul_i32 s1, s0, s6
+; GFX9-NEXT: s_sub_i32 s1, s4, s1
+; GFX9-NEXT: s_add_i32 s9, s0, 1
+; GFX9-NEXT: s_sub_i32 s4, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s0, s9, s0
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_add_i32 s4, s0, 1
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
-; GFX9-NEXT: s_cselect_b32 s3, s7, s6
-; GFX9-NEXT: s_sub_i32 s4, 0, s2
-; GFX9-NEXT: s_mul_i32 s4, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
-; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8
-; GFX9-NEXT: s_mul_i32 s6, s4, s2
-; GFX9-NEXT: s_sub_i32 s5, s5, s6
-; GFX9-NEXT: s_add_i32 s7, s4, 1
-; GFX9-NEXT: s_sub_i32 s6, s5, s2
-; GFX9-NEXT: s_cmp_ge_u32 s5, s2
-; GFX9-NEXT: s_cselect_b32 s4, s7, s4
-; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_add_i32 s6, s4, 1
-; GFX9-NEXT: s_cmp_ge_u32 s5, s2
-; GFX9-NEXT: s_cselect_b32 s2, s6, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_cselect_b32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s1, 0, s7
+; GFX9-NEXT: s_mul_i32 s1, s1, s8
+; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1
+; GFX9-NEXT: s_add_i32 s8, s8, s1
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s8
+; GFX9-NEXT: s_mul_i32 s4, s1, s7
+; GFX9-NEXT: s_sub_i32 s4, s5, s4
+; GFX9-NEXT: s_add_i32 s6, s1, 1
+; GFX9-NEXT: s_sub_i32 s5, s4, s7
+; GFX9-NEXT: s_cmp_ge_u32 s4, s7
+; GFX9-NEXT: s_cselect_b32 s1, s6, s1
+; GFX9-NEXT: s_cselect_b32 s4, s5, s4
+; GFX9-NEXT: s_add_i32 s5, s1, 1
+; GFX9-NEXT: s_cmp_ge_u32 s4, s7
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = udiv <2 x i32> %x, %shl.y
@@ -6051,50 +6052,50 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT: s_sub_i32 s6, 0, s3
+; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
+; GFX9-NEXT: s_sub_i32 s0, 0, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s7, v0
-; GFX9-NEXT: s_mul_i32 s6, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7
-; GFX9-NEXT: s_mul_i32 s6, s6, s3
-; GFX9-NEXT: s_sub_i32 s4, s4, s6
-; GFX9-NEXT: s_sub_i32 s6, s4, s3
-; GFX9-NEXT: s_cmp_ge_u32 s4, s3
-; GFX9-NEXT: s_cselect_b32 s4, s6, s4
-; GFX9-NEXT: s_sub_i32 s6, s4, s3
-; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_mul_i32 s0, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s1
+; GFX9-NEXT: s_mul_i32 s0, s0, s6
+; GFX9-NEXT: s_sub_i32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s1, s0, s6
+; GFX9-NEXT: s_cmp_ge_u32 s0, s6
+; GFX9-NEXT: s_cselect_b32 s0, s1, s0
+; GFX9-NEXT: s_sub_i32 s1, s0, s6
+; GFX9-NEXT: s_cmp_ge_u32 s0, s6
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
-; GFX9-NEXT: s_cselect_b32 s3, s6, s4
-; GFX9-NEXT: s_sub_i32 s4, 0, s2
-; GFX9-NEXT: s_mul_i32 s4, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
-; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8
-; GFX9-NEXT: s_mul_i32 s4, s4, s2
-; GFX9-NEXT: s_sub_i32 s4, s5, s4
-; GFX9-NEXT: s_sub_i32 s5, s4, s2
-; GFX9-NEXT: s_cmp_ge_u32 s4, s2
-; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_sub_i32 s5, s4, s2
-; GFX9-NEXT: s_cmp_ge_u32 s4, s2
-; GFX9-NEXT: s_cselect_b32 s2, s5, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_cselect_b32 s0, s1, s0
+; GFX9-NEXT: s_sub_i32 s1, 0, s7
+; GFX9-NEXT: s_mul_i32 s1, s1, s8
+; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1
+; GFX9-NEXT: s_add_i32 s8, s8, s1
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s8
+; GFX9-NEXT: s_mul_i32 s1, s1, s7
+; GFX9-NEXT: s_sub_i32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = urem <2 x i32> %x, %shl.y
@@ -6546,65 +6547,66 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6
; GFX9-NEXT: s_abs_i32 s3, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT: s_xor_b32 s2, s4, s2
; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7
; GFX9-NEXT: s_abs_i32 s7, s4
-; GFX9-NEXT: s_xor_b32 s2, s4, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_i32 s4, 0, s3
-; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_ashr_i32 s4, s2, 31
+; GFX9-NEXT: s_sub_i32 s2, 0, s3
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s4, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
-; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s7, s8
-; GFX9-NEXT: s_mul_i32 s8, s4, s3
+; GFX9-NEXT: s_mul_i32 s2, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s2, s8, s2
+; GFX9-NEXT: s_add_i32 s8, s8, s2
+; GFX9-NEXT: s_mul_hi_u32 s2, s7, s8
+; GFX9-NEXT: s_mul_i32 s8, s2, s3
; GFX9-NEXT: s_sub_i32 s7, s7, s8
-; GFX9-NEXT: s_add_i32 s9, s4, 1
+; GFX9-NEXT: s_add_i32 s9, s2, 1
; GFX9-NEXT: s_sub_i32 s8, s7, s3
; GFX9-NEXT: s_cmp_ge_u32 s7, s3
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
+; GFX9-NEXT: s_cselect_b32 s2, s9, s2
; GFX9-NEXT: s_cselect_b32 s7, s8, s7
-; GFX9-NEXT: s_add_i32 s8, s4, 1
+; GFX9-NEXT: s_add_i32 s8, s2, 1
; GFX9-NEXT: s_cmp_ge_u32 s7, s3
-; GFX9-NEXT: s_cselect_b32 s3, s8, s4
-; GFX9-NEXT: s_abs_i32 s4, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s2
-; GFX9-NEXT: s_sub_i32 s7, 0, s4
-; GFX9-NEXT: s_sub_i32 s2, s3, s2
+; GFX9-NEXT: s_cselect_b32 s7, s8, s2
+; GFX9-NEXT: s_abs_i32 s8, s6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_xor_b32 s0, s5, s6
+; GFX9-NEXT: s_abs_i32 s1, s5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s6, s5, s6
-; GFX9-NEXT: s_abs_i32 s5, s5
-; GFX9-NEXT: s_ashr_i32 s6, s6, 31
+; GFX9-NEXT: s_xor_b32 s5, s7, s4
+; GFX9-NEXT: s_sub_i32 s6, 0, s8
+; GFX9-NEXT: s_sub_i32 s4, s5, s4
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s7, s7, s3
-; GFX9-NEXT: s_mul_hi_u32 s7, s3, s7
-; GFX9-NEXT: s_add_i32 s3, s3, s7
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3
-; GFX9-NEXT: s_mul_i32 s7, s3, s4
-; GFX9-NEXT: s_sub_i32 s5, s5, s7
-; GFX9-NEXT: s_add_i32 s8, s3, 1
-; GFX9-NEXT: s_sub_i32 s7, s5, s4
-; GFX9-NEXT: s_cmp_ge_u32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s3, s8, s3
+; GFX9-NEXT: s_ashr_i32 s0, s0, 31
+; GFX9-NEXT: v_readfirstlane_b32 s5, v0
+; GFX9-NEXT: s_mul_i32 s6, s6, s5
+; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6
+; GFX9-NEXT: s_add_i32 s5, s5, s6
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5
+; GFX9-NEXT: s_mul_i32 s6, s5, s8
+; GFX9-NEXT: s_sub_i32 s1, s1, s6
+; GFX9-NEXT: s_add_i32 s7, s5, 1
+; GFX9-NEXT: s_sub_i32 s6, s1, s8
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
; GFX9-NEXT: s_cselect_b32 s5, s7, s5
-; GFX9-NEXT: s_add_i32 s7, s3, 1
-; GFX9-NEXT: s_cmp_ge_u32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s3, s7, s3
-; GFX9-NEXT: s_xor_b32 s3, s3, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_cselect_b32 s1, s6, s1
+; GFX9-NEXT: s_add_i32 s6, s5, 1
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
+; GFX9-NEXT: s_cselect_b32 s1, s6, s5
+; GFX9-NEXT: s_xor_b32 s1, s1, s0
+; GFX9-NEXT: s_sub_i32 s0, s1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = sdiv <2 x i32> %x, %shl.y
@@ -6989,7 +6991,6 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6
; GFX9-NEXT: s_abs_i32 s2, s2
@@ -7013,35 +7014,37 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cselect_b32 s4, s7, s4
; GFX9-NEXT: s_sub_i32 s7, s4, s2
; GFX9-NEXT: s_cmp_ge_u32 s4, s2
-; GFX9-NEXT: s_cselect_b32 s2, s7, s4
-; GFX9-NEXT: s_abs_i32 s3, s3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_xor_b32 s2, s2, s6
-; GFX9-NEXT: s_sub_i32 s7, 0, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s6
+; GFX9-NEXT: s_cselect_b32 s4, s7, s4
+; GFX9-NEXT: s_abs_i32 s7, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT: s_xor_b32 s4, s4, s6
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_ashr_i32 s0, s5, 31
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_ashr_i32 s4, s5, 31
-; GFX9-NEXT: s_abs_i32 s5, s5
+; GFX9-NEXT: s_abs_i32 s1, s5
+; GFX9-NEXT: s_sub_i32 s5, 0, s7
+; GFX9-NEXT: s_sub_i32 s4, s4, s6
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7
-; GFX9-NEXT: s_add_i32 s6, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6
-; GFX9-NEXT: s_mul_i32 s6, s6, s3
-; GFX9-NEXT: s_sub_i32 s5, s5, s6
-; GFX9-NEXT: s_sub_i32 s6, s5, s3
-; GFX9-NEXT: s_cmp_ge_u32 s5, s3
-; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_sub_i32 s6, s5, s3
-; GFX9-NEXT: s_cmp_ge_u32 s5, s3
-; GFX9-NEXT: s_cselect_b32 s3, s6, s5
-; GFX9-NEXT: s_xor_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_mul_i32 s5, s5, s6
+; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT: s_add_i32 s6, s6, s5
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s6
+; GFX9-NEXT: s_mul_i32 s5, s5, s7
+; GFX9-NEXT: s_sub_i32 s1, s1, s5
+; GFX9-NEXT: s_sub_i32 s5, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s5, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_xor_b32 s1, s1, s0
+; GFX9-NEXT: s_sub_i32 s0, s1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = srem <2 x i32> %x, %shl.y
@@ -7281,13 +7284,13 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: udiv_i64_pow2k_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 12
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = udiv i64 %x, 4096
store i64 %r, ptr addrspace(1) %out
@@ -7614,18 +7617,18 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s8, 12
+; GFX9-NEXT: s_add_i32 s0, s8, 12
; GFX9-NEXT: s_add_i32 s8, s10, 12
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], s2
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], s0
; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s8
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
%r = udiv <2 x i64> %x, %shl.y
@@ -7862,12 +7865,12 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: urem_i64_pow2k_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s2, s2, 0xfff
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT: s_and_b32 s0, s6, 0xfff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = urem i64 %x, 4096
store i64 %r, ptr addrspace(1) %out
@@ -8003,22 +8006,22 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s10
+; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s10
; GFX9-NEXT: s_lshl_b64 s[8:9], 0x1000, s8
; GFX9-NEXT: s_add_u32 s8, s8, -1
; GFX9-NEXT: s_addc_u32 s9, s9, -1
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
-; GFX9-NEXT: s_add_u32 s2, s2, -1
-; GFX9-NEXT: s_addc_u32 s3, s3, -1
-; GFX9-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; GFX9-NEXT: s_add_u32 s0, s0, -1
+; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
%r = urem <2 x i64> %x, %shl.y
@@ -8129,58 +8132,58 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: sdiv_i64_oddk_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s4, 0x33fe64
-; GFX9-NEXT: s_add_u32 s4, 0x396, s4
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s0, 0x33fe64
+; GFX9-NEXT: s_add_u32 s0, 0x396, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 0x28100000
-; GFX9-NEXT: s_addc_u32 s5, 0, 0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: s_addc_u32 s1, 0, 0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s4, s5, 0xd95
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s5, s4, 0xffed2705
-; GFX9-NEXT: s_mul_hi_u32 s7, s6, 0xffed2705
-; GFX9-NEXT: s_add_i32 s7, s7, s5
-; GFX9-NEXT: s_sub_i32 s5, s7, s6
-; GFX9-NEXT: s_mul_i32 s8, s6, 0xffed2705
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s5
-; GFX9-NEXT: s_mul_i32 s12, s6, s5
-; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8
-; GFX9-NEXT: s_add_u32 s6, s6, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s4, s8
-; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_addc_u32 s0, s1, 0xd95
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s1, s0, 0xffed2705
+; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xffed2705
+; GFX9-NEXT: s_add_i32 s3, s3, s1
+; GFX9-NEXT: s_sub_i32 s1, s3, s2
+; GFX9-NEXT: s_mul_i32 s8, s2, 0xffed2705
+; GFX9-NEXT: s_mul_hi_u32 s11, s2, s1
+; GFX9-NEXT: s_mul_i32 s12, s2, s1
+; GFX9-NEXT: s_mul_hi_u32 s2, s2, s8
+; GFX9-NEXT: s_add_u32 s2, s2, s12
+; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8
+; GFX9-NEXT: s_mul_i32 s10, s0, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s11
-; GFX9-NEXT: s_add_u32 s6, s6, s10
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5
-; GFX9-NEXT: s_addc_u32 s6, s8, s9
-; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s5, s4, s5
-; GFX9-NEXT: s_add_u32 s5, s6, s5
-; GFX9-NEXT: s_addc_u32 s6, 0, s7
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX9-NEXT: s_add_u32 s2, s2, s10
+; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1
+; GFX9-NEXT: s_addc_u32 s2, s8, s9
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_mul_i32 s1, s0, s1
+; GFX9-NEXT: s_add_u32 s1, s2, s1
+; GFX9-NEXT: s_addc_u32 s2, 0, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s6, s4, s6
+; GFX9-NEXT: s_addc_u32 s8, s0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_u32 s2, s2, s4
-; GFX9-NEXT: s_mov_b32 s5, s4
-; GFX9-NEXT: s_addc_u32 s3, s3, s4
-; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
+; GFX9-NEXT: s_add_u32 s2, s6, s0
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_addc_u32 s3, s7, s0
+; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT: v_readfirstlane_b32 s9, v0
-; GFX9-NEXT: s_mul_i32 s8, s2, s6
+; GFX9-NEXT: s_mul_i32 s7, s2, s8
; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9
-; GFX9-NEXT: s_mul_hi_u32 s7, s2, s6
-; GFX9-NEXT: s_add_u32 s8, s10, s8
-; GFX9-NEXT: s_addc_u32 s7, 0, s7
+; GFX9-NEXT: s_mul_hi_u32 s6, s2, s8
+; GFX9-NEXT: s_add_u32 s7, s10, s7
+; GFX9-NEXT: s_addc_u32 s6, 0, s6
; GFX9-NEXT: s_mul_hi_u32 s11, s3, s9
; GFX9-NEXT: s_mul_i32 s9, s3, s9
-; GFX9-NEXT: s_add_u32 s8, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s10, s3, s6
-; GFX9-NEXT: s_addc_u32 s7, s7, s11
-; GFX9-NEXT: s_addc_u32 s8, s10, 0
-; GFX9-NEXT: s_mul_i32 s6, s3, s6
-; GFX9-NEXT: s_add_u32 s6, s7, s6
-; GFX9-NEXT: s_addc_u32 s7, 0, s8
+; GFX9-NEXT: s_add_u32 s7, s7, s9
+; GFX9-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX9-NEXT: s_addc_u32 s6, s6, s11
+; GFX9-NEXT: s_addc_u32 s7, s10, 0
+; GFX9-NEXT: s_mul_i32 s8, s3, s8
+; GFX9-NEXT: s_add_u32 s6, s6, s8
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
; GFX9-NEXT: s_add_u32 s8, s6, 1
; GFX9-NEXT: s_addc_u32 s9, s7, 0
; GFX9-NEXT: s_add_u32 s10, s6, 2
@@ -8213,13 +8216,13 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cselect_b32 s3, s3, s7
; GFX9-NEXT: s_cselect_b32 s2, s8, s6
-; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX9-NEXT: s_sub_u32 s2, s2, s4
-; GFX9-NEXT: s_subb_u32 s3, s3, s4
+; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT: s_sub_u32 s2, s2, s0
+; GFX9-NEXT: s_subb_u32 s3, s3, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = sdiv i64 %x, 1235195
store i64 %r, ptr addrspace(1) %out
@@ -8252,17 +8255,17 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: sdiv_i64_pow2k_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_lshr_b32 s4, s4, 20
-; GFX9-NEXT: s_add_u32 s2, s2, s4
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
-; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
+; GFX9-NEXT: s_lshr_b32 s0, s0, 20
+; GFX9-NEXT: s_add_u32 s0, s6, s0
+; GFX9-NEXT: s_addc_u32 s1, s7, 0
+; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = sdiv i64 %x, 4096
store i64 %r, ptr addrspace(1) %out
@@ -9518,100 +9521,100 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: srem_i64_oddk_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s4, 0x33fe64
-; GFX9-NEXT: s_add_u32 s4, 0x396, s4
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s0, 0x33fe64
+; GFX9-NEXT: s_add_u32 s0, 0x396, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 0x28100000
-; GFX9-NEXT: s_addc_u32 s5, 0, 0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: s_addc_u32 s1, 0, 0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s4, s5, 0xd95
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s5, s4, 0xffed2705
-; GFX9-NEXT: s_mul_hi_u32 s7, s6, 0xffed2705
-; GFX9-NEXT: s_add_i32 s7, s7, s5
-; GFX9-NEXT: s_sub_i32 s5, s7, s6
-; GFX9-NEXT: s_mul_i32 s8, s6, 0xffed2705
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s5
-; GFX9-NEXT: s_mul_i32 s12, s6, s5
-; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8
-; GFX9-NEXT: s_add_u32 s6, s6, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s4, s8
-; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_addc_u32 s0, s1, 0xd95
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s1, s0, 0xffed2705
+; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xffed2705
+; GFX9-NEXT: s_add_i32 s3, s3, s1
+; GFX9-NEXT: s_sub_i32 s1, s3, s2
+; GFX9-NEXT: s_mul_i32 s8, s2, 0xffed2705
+; GFX9-NEXT: s_mul_hi_u32 s11, s2, s1
+; GFX9-NEXT: s_mul_i32 s12, s2, s1
+; GFX9-NEXT: s_mul_hi_u32 s2, s2, s8
+; GFX9-NEXT: s_add_u32 s2, s2, s12
+; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8
+; GFX9-NEXT: s_mul_i32 s10, s0, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s11
-; GFX9-NEXT: s_add_u32 s6, s6, s10
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5
-; GFX9-NEXT: s_addc_u32 s6, s8, s9
-; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s5, s4, s5
-; GFX9-NEXT: s_add_u32 s5, s6, s5
-; GFX9-NEXT: s_addc_u32 s6, 0, s7
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX9-NEXT: s_add_u32 s2, s2, s10
+; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1
+; GFX9-NEXT: s_addc_u32 s2, s8, s9
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_mul_i32 s1, s0, s1
+; GFX9-NEXT: s_add_u32 s1, s2, s1
+; GFX9-NEXT: s_addc_u32 s2, 0, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s6, s4, s6
+; GFX9-NEXT: s_addc_u32 s8, s0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_u32 s2, s2, s4
-; GFX9-NEXT: s_mov_b32 s5, s4
-; GFX9-NEXT: s_addc_u32 s3, s3, s4
-; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s7, s2, s6
-; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT: s_add_u32 s7, s9, s7
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s10, s3, s8
-; GFX9-NEXT: s_mul_i32 s8, s3, s8
-; GFX9-NEXT: s_add_u32 s7, s7, s8
-; GFX9-NEXT: s_mul_hi_u32 s9, s3, s6
-; GFX9-NEXT: s_addc_u32 s5, s5, s10
-; GFX9-NEXT: s_addc_u32 s7, s9, 0
-; GFX9-NEXT: s_mul_i32 s6, s3, s6
-; GFX9-NEXT: s_add_u32 s5, s5, s6
-; GFX9-NEXT: s_addc_u32 s6, 0, s7
-; GFX9-NEXT: s_mul_hi_u32 s8, s5, 0x12d8fb
-; GFX9-NEXT: s_mul_i32 s5, s5, 0x12d8fb
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
+; GFX9-NEXT: s_add_u32 s2, s6, s0
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_addc_u32 s3, s7, s0
+; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-NEXT: s_mul_i32 s6, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s9, s2, s7
+; GFX9-NEXT: s_mul_hi_u32 s1, s2, s8
+; GFX9-NEXT: s_add_u32 s6, s9, s6
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_mul_hi_u32 s10, s3, s7
+; GFX9-NEXT: s_mul_i32 s7, s3, s7
+; GFX9-NEXT: s_add_u32 s6, s6, s7
+; GFX9-NEXT: s_mul_hi_u32 s9, s3, s8
+; GFX9-NEXT: s_addc_u32 s1, s1, s10
+; GFX9-NEXT: s_addc_u32 s6, s9, 0
+; GFX9-NEXT: s_mul_i32 s7, s3, s8
+; GFX9-NEXT: s_add_u32 s1, s1, s7
+; GFX9-NEXT: s_addc_u32 s6, 0, s6
+; GFX9-NEXT: s_mul_hi_u32 s8, s1, 0x12d8fb
+; GFX9-NEXT: s_mul_i32 s1, s1, 0x12d8fb
; GFX9-NEXT: s_mul_i32 s6, s6, 0x12d8fb
-; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_add_i32 s8, s8, s6
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: s_mov_b32 s7, 0x12d8fb
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_subb_u32 s2, s3, s8
+; GFX9-NEXT: s_subb_u32 s1, s3, s8
; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s7, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_subb_u32 s3, s2, 0
+; GFX9-NEXT: s_subb_u32 s2, s1, 0
; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s7, v1
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_subb_u32 s5, s3, 0
+; GFX9-NEXT: s_subb_u32 s3, s2, 0
; GFX9-NEXT: s_mov_b32 s6, 0x12d8fa
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1
-; GFX9-NEXT: s_cmp_eq_u32 s3, 0
+; GFX9-NEXT: s_cmp_eq_u32 s2, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX9-NEXT: v_mov_b32_e32 v5, s3
-; GFX9-NEXT: v_mov_b32_e32 v6, s5
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cmp_eq_u32 s1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
-; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT: v_xor_b32_e32 v1, s4, v3
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
+; GFX9-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = srem i64 %x, 1235195
store i64 %r, ptr addrspace(1) %out
@@ -9646,19 +9649,19 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: srem_i64_pow2k_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_lshr_b32 s4, s4, 20
-; GFX9-NEXT: s_add_u32 s4, s2, s4
-; GFX9-NEXT: s_addc_u32 s5, s3, 0
-; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000
-; GFX9-NEXT: s_sub_u32 s2, s2, s4
-; GFX9-NEXT: s_subb_u32 s3, s3, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
+; GFX9-NEXT: s_lshr_b32 s0, s0, 20
+; GFX9-NEXT: s_add_u32 s0, s6, s0
+; GFX9-NEXT: s_addc_u32 s1, s7, 0
+; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000
+; GFX9-NEXT: s_sub_u32 s0, s6, s0
+; GFX9-NEXT: s_subb_u32 s1, s7, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = srem i64 %x, 4096
store i64 %r, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index d613759..c623364 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -227,10 +227,10 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32,
; SI: s_load_dword [[B:s[0-9]+]]
; SI: s_load_dwordx2
; SI-NOT: and
-; SI: s_lshl_b32 [[A]], [[A]], 1
-; SI: s_lshl_b32 [[B]], [[B]], 1
-; SI: s_and_b32 s{{[0-9]+}}, [[A]], 62
-; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
+; SI: s_lshl_b32 [[C:s[0-9]+]], [[A]], 1
+; SI: s_lshl_b32 [[D:s[0-9]+]], [[B]], 1
+; SI: s_and_b32 s{{[0-9]+}}, [[C]], 62
+; SI: s_and_b32 s{{[0-9]+}}, [[D]], 62
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
@@ -371,9 +371,9 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
; SI: s_load_dword [[A:s[0-9]+]]
-; SI: s_lshl_b32 [[A]], [[A]], 1{{$}}
+; SI: s_lshl_b32 [[B:s[0-9]+]], [[A]], 1{{$}}
; SI-NOT: and
-; SI: s_and_b32 s{{[0-9]+}}, [[A]], 64
+; SI: s_and_b32 s{{[0-9]+}}, [[B]], 64
; SI-NOT: and
; SI: s_add_u32
; SI-NEXT: s_addc_u32
diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 897e134..4617a53 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -22,17 +22,17 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
;
; GFX8-LABEL: anyext_i1_i32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
-; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: anyext_i1_i32:
@@ -89,15 +89,15 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace
; GFX8-LABEL: s_anyext_i16_i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v2, v[2:3]
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 624101d..220fa5a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -63,13 +63,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -91,13 +91,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
@@ -119,13 +119,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -146,13 +147,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -175,14 +177,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB0_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -206,14 +208,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB0_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -238,14 +240,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB0_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -269,14 +271,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -336,14 +338,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -366,14 +368,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
@@ -396,13 +398,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
+; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
@@ -424,13 +427,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
+; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
@@ -454,14 +458,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -486,14 +490,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -519,14 +523,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
+; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[0:1]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -551,14 +555,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
+; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -612,13 +616,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -653,13 +657,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
@@ -693,13 +697,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB2_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
@@ -732,13 +737,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB2_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
@@ -774,14 +780,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB2_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -817,13 +823,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB2_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -861,14 +867,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB2_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -904,13 +910,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -969,13 +975,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB3_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1012,13 +1018,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB3_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: struct_add_i32_varying_vdata:
@@ -1055,13 +1061,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W64-NEXT: .LBB3_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: struct_add_i32_varying_vdata:
@@ -1097,13 +1104,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W32-NEXT: .LBB3_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: struct_add_i32_varying_vdata:
@@ -1142,14 +1150,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB3_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1188,13 +1196,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB3_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1235,14 +1243,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB3_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1281,13 +1289,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB3_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1315,12 +1323,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: add_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -1328,51 +1336,54 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: add_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i32_varying_offset:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: add_i32_varying_offset:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1431,14 +1442,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1460,14 +1471,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
@@ -1489,14 +1500,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
@@ -1517,14 +1529,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
@@ -1547,15 +1560,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB5_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1579,15 +1592,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB5_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1612,15 +1625,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1644,15 +1657,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1712,14 +1725,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB6_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1742,14 +1755,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB6_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
@@ -1772,14 +1785,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB6_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
@@ -1801,14 +1814,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB6_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
@@ -1832,15 +1845,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB6_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1865,15 +1878,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB6_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1899,15 +1912,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1932,15 +1945,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1994,13 +2007,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB7_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2035,13 +2048,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB7_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
@@ -2075,13 +2088,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB7_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
@@ -2114,13 +2128,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB7_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_vdata:
@@ -2156,14 +2171,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB7_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -2199,14 +2214,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB7_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -2244,14 +2259,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB7_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -2287,14 +2302,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB7_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -2322,12 +2337,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: sub_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -2335,51 +2350,54 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: sub_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i32_varying_offset:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sub_i32_varying_offset:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 8ee0ee3..529af3d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -48,243 +48,243 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
;
; GFX89-LABEL: add_i32_constant:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b64 s[6:7], exec
-; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b64 s[2:3], exec
+; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX89-NEXT: ; implicit-def: $vgpr1
-; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX89-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX89-NEXT: s_cbranch_execz .LBB0_2
; GFX89-NEXT: ; %bb.1:
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX89-NEXT: s_mul_i32 s2, s2, 5
; GFX89-NEXT: s_mov_b32 s11, 0xf000
; GFX89-NEXT: s_mov_b32 s10, -1
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: v_mov_b32_e32 v1, s2
; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
; GFX89-NEXT: .LBB0_2:
-; GFX89-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX89-NEXT: v_readfirstlane_b32 s4, v1
+; GFX89-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX89-NEXT: v_readfirstlane_b32 s0, v1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
-; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4
-; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
+; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: s_mov_b32 s10, -1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s5
+; GFX1032-NEXT: v_mov_b32_e32 v1, s1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
+; GFX1164-NEXT: s_mul_i32 s2, s2, 5
; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB0_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_mov_b32_e32 v1, s5
+; GFX1132-NEXT: v_mov_b32_e32 v1, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB0_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: add_i32_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1264-NEXT: s_cbranch_execz .LBB0_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_i32 s6, s6, 5
+; GFX1264-NEXT: s_mul_i32 s2, s2, 5
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v1, s6
+; GFX1264-NEXT: v_mov_b32_e32 v1, s2
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB0_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: add_i32_constant:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s5, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232-NEXT: s_mov_b32 s0, exec_lo
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB0_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1232-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_i32 s5, s5, 5
+; GFX1232-NEXT: s_mul_i32 s1, s1, 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_mov_b32_e32 v1, s5
+; GFX1232-NEXT: v_mov_b32_e32 v1, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB0_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -647,280 +647,280 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: add_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: s_mov_b32 s8, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
; GFX8-NEXT: s_mov_b32 m0, s4
-; GFX8-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8-NEXT: v_readlane_b32 s6, v0, s4
; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s7
+; GFX8-NEXT: v_writelane_b32 v1, s8, m0
+; GFX8-NEXT: s_add_i32 s8, s8, s6
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB2_4
; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: s_mov_b32 s11, 0xf000
-; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s15, 0xf000
+; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX8-NEXT: s_mov_b32 s12, s6
+; GFX8-NEXT: s_mov_b32 s13, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_atomic_add v0, off, s[12:15], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB2_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v1
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s6, 0
+; GFX9-NEXT: s_mov_b32 s8, 0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
; GFX9-NEXT: s_mov_b32 m0, s4
-; GFX9-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9-NEXT: v_readlane_b32 s6, v0, s4
; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s7
+; GFX9-NEXT: v_writelane_b32 v1, s8, m0
+; GFX9-NEXT: s_add_i32 s8, s8, s6
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB2_4
; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s15, 0xf000
+; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: buffer_atomic_add v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB2_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s6, 0
+; GFX1064-NEXT: s_mov_b32 s8, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1064-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s8, s6
; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064-NEXT: s_add_i32 s6, s6, s8
+; GFX1064-NEXT: s_add_i32 s8, s8, s7
; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB2_4
; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v0, s8
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB2_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1032-NEXT: s_andn2_b32 s2, s2, s6
-; GFX1032-NEXT: s_add_i32 s4, s4, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT: s_add_i32 s2, s2, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execz .LBB2_4
; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032-NEXT: v_mov_b32_e32 v0, s2
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB2_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_varying:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s6, 0
+; GFX1164-NEXT: s_mov_b32 s8, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1164-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s8, s6
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164-NEXT: s_add_i32 s6, s6, s8
+; GFX1164-NEXT: s_add_i32 s8, s8, s7
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB2_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB2_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1132-NEXT: s_add_i32 s4, s4, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT: s_add_i32 s2, s2, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execz .LBB2_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, s4
+; GFX1132-NEXT: v_mov_b32_e32 v0, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB2_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -928,97 +928,97 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-LABEL: add_i32_varying:
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_mov_b64 s[2:3], exec
-; GFX1264-NEXT: s_mov_b32 s6, 0
+; GFX1264-NEXT: s_mov_b32 s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1264-NEXT: s_ctz_i32_b64 s6, s[2:3]
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1264-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1264-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1264-NEXT: v_writelane_b32 v1, s8, s6
; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1264-NEXT: s_add_co_i32 s6, s6, s8
+; GFX1264-NEXT: s_add_co_i32 s8, s8, s7
; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1264-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0
-; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1264-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1264-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1264-NEXT: s_cbranch_execz .LBB2_4
; GFX1264-NEXT: ; %bb.3:
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
+; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB2_4:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: add_i32_varying:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, 0
+; GFX1232-NEXT: s_mov_b32 s3, exec_lo
+; GFX1232-NEXT: s_mov_b32 s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232-NEXT: s_ctz_i32_b32 s4, s3
; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1232-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1232-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1232-NEXT: s_add_co_i32 s4, s4, s5
-; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1232-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1232-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1232-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1232-NEXT: s_add_co_i32 s2, s2, s5
+; GFX1232-NEXT: s_cmp_lg_u32 s3, 0
; GFX1232-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232-NEXT: ; implicit-def: $vgpr0
-; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1232-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1232-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1232-NEXT: s_cbranch_execz .LBB2_4
; GFX1232-NEXT: ; %bb.3:
-; GFX1232-NEXT: v_mov_b32_e32 v0, s4
+; GFX1232-NEXT: v_mov_b32_e32 v0, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_mov_b32 s10, -1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB2_4:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -1071,260 +1071,259 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
;
; GFX89-LABEL: add_i64_constant:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b64 s[6:7], exec
-; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b64 s[2:3], exec
+; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX89-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX89-NEXT: s_cbranch_execz .LBB3_2
; GFX89-NEXT: ; %bb.1:
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX89-NEXT: s_mul_i32 s2, s2, 5
; GFX89-NEXT: s_mov_b32 s11, 0xf000
; GFX89-NEXT: s_mov_b32 s10, -1
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: v_mov_b32_e32 v0, s2
; GFX89-NEXT: v_mov_b32_e32 v1, 0
; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
; GFX89-NEXT: .LBB3_2:
-; GFX89-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX89-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX89-NEXT: v_readfirstlane_b32 s0, v0
+; GFX89-NEXT: v_readfirstlane_b32 s1, v1
+; GFX89-NEXT: v_mov_b32_e32 v0, s0
+; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1]
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_readfirstlane_b32 s2, v0
-; GFX89-NEXT: v_readfirstlane_b32 s3, v1
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
-; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
-; GFX89-NEXT: s_nop 2
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB3_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
+; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB3_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, s[0:1]
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB3_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB3_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, v2, 5, s[0:1]
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
+; GFX1164-NEXT: s_mul_i32 s2, s2, 5
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mov_b32_e32 v0, s2
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB3_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[0:1]
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB3_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB3_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[0:1]
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: add_i64_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
; GFX1264-NEXT: s_mov_b32 s9, 0
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB3_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[2:3]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
+; GFX1264-NEXT: s_mul_u64 s[2:3], s[8:9], 5
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
-; GFX1264-NEXT: v_mov_b32_e32 v1, s7
+; GFX1264-NEXT: v_mov_b32_e32 v0, s2
+; GFX1264-NEXT: v_mov_b32_e32 v1, s3
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB3_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[0:1]
+; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: add_i64_constant:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: s_mov_b32 s5, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0
-; GFX1232-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_mov_b32 s3, exec_lo
+; GFX1232-NEXT: s_mov_b32 s1, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB3_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1232-NEXT: s_bcnt1_i32_b32 s0, s3
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
+; GFX1232-NEXT: s_mul_u64 s[0:1], s[0:1], 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1232-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB3_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[0:1]
+; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -1383,21 +1382,21 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: add_i64_uniform:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_mov_b64 s[8:9], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB4_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s12, s6
; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
-; GFX8-NEXT: s_mul_i32 s6, s1, s6
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s2, v0, 0
+; GFX8-NEXT: s_mul_i32 s6, s3, s6
; GFX8-NEXT: s_mov_b32 s15, 0xf000
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s13, s7
@@ -1406,14 +1405,14 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB4_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v3, s1, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1]
+; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
@@ -1548,9 +1547,9 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1164-NEXT: s_mov_b64 s[8:9], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
@@ -1561,9 +1560,9 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s9, s1, s8
-; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8
-; GFX1164-NEXT: s_mul_i32 s8, s0, s8
+; GFX1164-NEXT: s_mul_i32 s9, s3, s8
+; GFX1164-NEXT: s_mul_hi_u32 s10, s2, s8
+; GFX1164-NEXT: s_mul_i32 s8, s2, s8
; GFX1164-NEXT: s_add_i32 s10, s10, s9
; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: v_mov_b32_e32 v1, s10
@@ -1575,15 +1574,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB4_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[0:1]
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
@@ -1595,24 +1594,24 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1132-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB4_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s8
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s8, s1, s3
-; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3
-; GFX1132-NEXT: s_mul_i32 s3, s0, s3
+; GFX1132-NEXT: s_mul_i32 s8, s3, s1
+; GFX1132-NEXT: s_mul_hi_u32 s9, s2, s1
+; GFX1132-NEXT: s_mul_i32 s1, s2, s1
; GFX1132-NEXT: s_add_i32 s9, s9, s8
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s9
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_mov_b32 s8, s6
; GFX1132-NEXT: s_mov_b32 s9, s7
@@ -1621,15 +1620,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB4_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[0:1]
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
@@ -1641,11 +1640,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_clause 0x1
; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1264-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b32 s11, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -1654,7 +1653,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: ; %bb.1:
; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11]
+; GFX1264-NEXT: s_mul_u64 s[8:9], s[2:3], s[10:11]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: v_mov_b32_e32 v1, s9
@@ -1665,15 +1664,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB4_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
+; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v2, s[0:1]
+; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s3, v2, v[1:2]
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1683,22 +1682,22 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232: ; %bb.0: ; %entry
; GFX1232-NEXT: s_clause 0x1
; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s3, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
+; GFX1232-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1232-NEXT: s_mov_b32 s9, exec_lo
+; GFX1232-NEXT: s_mov_b32 s1, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB4_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1232-NEXT: s_bcnt1_i32_b32 s0, s9
; GFX1232-NEXT: s_mov_b32 s15, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3]
+; GFX1232-NEXT: s_mul_u64 s[0:1], s[2:3], s[0:1]
; GFX1232-NEXT: s_mov_b32 s14, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1232-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1232-NEXT: s_mov_b32 s12, s6
; GFX1232-NEXT: s_mov_b32 s13, s7
; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN
@@ -1706,14 +1705,14 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB4_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
+; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v2, s[0:1]
+; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s3, v2, v[1:2]
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1746,82 +1745,82 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX89-LABEL: add_i64_varying:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: v_mov_b32_e32 v1, 0
; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX10-LABEL: add_i64_varying:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s11, s7
-; GFX10-NEXT: s_mov_b32 s10, s6
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s11, s3
+; GFX10-NEXT: s_mov_b32 s10, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_mov_b32 s5, s1
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: s_mov_b32 s1, s5
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i64_varying:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_mov_b32 s10, s6
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: add_i64_varying:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s11, s7
-; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_mov_b32 s10, s2
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_mov_b32 s5, s1
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_mov_b32 s1, s5
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1869,283 +1868,283 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: sub_i32_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB6_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT: s_mul_i32 s2, s2, 5
; GFX8-NEXT: s_mov_b32 s11, 0xf000
; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB6_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_mul_i32 s2, s2, 5
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB6_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB6_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
+; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: s_mov_b32 s10, -1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB6_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s5
+; GFX1032-NEXT: v_mov_b32_e32 v1, s1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB6_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB6_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
+; GFX1164-NEXT: s_mul_i32 s2, s2, 5
; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB6_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i32_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB6_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_mov_b32_e32 v1, s5
+; GFX1132-NEXT: v_mov_b32_e32 v1, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB6_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: sub_i32_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1264-NEXT: s_cbranch_execz .LBB6_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_i32 s6, s6, 5
+; GFX1264-NEXT: s_mul_i32 s2, s2, 5
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v1, s6
+; GFX1264-NEXT: v_mov_b32_e32 v1, s2
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB6_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v1
; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: sub_i32_constant:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s5, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232-NEXT: s_mov_b32 s0, exec_lo
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB6_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1232-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_i32 s5, s5, 5
+; GFX1232-NEXT: s_mul_i32 s1, s1, 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_mov_b32_e32 v1, s5
+; GFX1232-NEXT: v_mov_b32_e32 v1, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB6_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v1
; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -2514,280 +2513,280 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: sub_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: s_mov_b32 s8, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB8_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
; GFX8-NEXT: s_mov_b32 m0, s4
-; GFX8-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8-NEXT: v_readlane_b32 s6, v0, s4
; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s7
+; GFX8-NEXT: v_writelane_b32 v1, s8, m0
+; GFX8-NEXT: s_add_i32 s8, s8, s6
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB8_4
; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: s_mov_b32 s11, 0xf000
-; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s15, 0xf000
+; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX8-NEXT: s_mov_b32 s12, s6
+; GFX8-NEXT: s_mov_b32 s13, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_atomic_sub v0, off, s[12:15], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB8_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v1
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s6, 0
+; GFX9-NEXT: s_mov_b32 s8, 0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
; GFX9-NEXT: s_mov_b32 m0, s4
-; GFX9-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9-NEXT: v_readlane_b32 s6, v0, s4
; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s7
+; GFX9-NEXT: v_writelane_b32 v1, s8, m0
+; GFX9-NEXT: s_add_i32 s8, s8, s6
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB8_4
; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s15, 0xf000
+; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: buffer_atomic_sub v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB8_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s6, 0
+; GFX1064-NEXT: s_mov_b32 s8, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1064-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s8, s6
; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064-NEXT: s_add_i32 s6, s6, s8
+; GFX1064-NEXT: s_add_i32 s8, s8, s7
; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB8_4
; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v0, s8
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB8_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1032-NEXT: s_andn2_b32 s2, s2, s6
-; GFX1032-NEXT: s_add_i32 s4, s4, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT: s_add_i32 s2, s2, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execz .LBB8_4
; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032-NEXT: v_mov_b32_e32 v0, s2
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB8_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_varying:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s6, 0
+; GFX1164-NEXT: s_mov_b32 s8, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1164-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s8, s6
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164-NEXT: s_add_i32 s6, s6, s8
+; GFX1164-NEXT: s_add_i32 s8, s8, s7
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB8_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB8_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1132-NEXT: s_add_i32 s4, s4, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT: s_add_i32 s2, s2, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execz .LBB8_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, s4
+; GFX1132-NEXT: v_mov_b32_e32 v0, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB8_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2795,97 +2794,97 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-LABEL: sub_i32_varying:
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_mov_b64 s[2:3], exec
-; GFX1264-NEXT: s_mov_b32 s6, 0
+; GFX1264-NEXT: s_mov_b32 s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1264-NEXT: s_ctz_i32_b64 s6, s[2:3]
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1264-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1264-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1264-NEXT: v_writelane_b32 v1, s8, s6
; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1264-NEXT: s_add_co_i32 s6, s6, s8
+; GFX1264-NEXT: s_add_co_i32 s8, s8, s7
; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1264-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0
-; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1264-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1264-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1264-NEXT: s_cbranch_execz .LBB8_4
; GFX1264-NEXT: ; %bb.3:
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
+; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB8_4:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: sub_i32_varying:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, 0
+; GFX1232-NEXT: s_mov_b32 s3, exec_lo
+; GFX1232-NEXT: s_mov_b32 s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232-NEXT: s_ctz_i32_b32 s4, s3
; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1232-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1232-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1232-NEXT: s_add_co_i32 s4, s4, s5
-; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1232-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1232-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1232-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1232-NEXT: s_add_co_i32 s2, s2, s5
+; GFX1232-NEXT: s_cmp_lg_u32 s3, 0
; GFX1232-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232-NEXT: ; implicit-def: $vgpr0
-; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1232-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1232-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1232-NEXT: s_cbranch_execz .LBB8_4
; GFX1232-NEXT: ; %bb.3:
-; GFX1232-NEXT: v_mov_b32_e32 v0, s4
+; GFX1232-NEXT: v_mov_b32_e32 v0, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_mov_b32 s10, -1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB8_4:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -2938,317 +2937,313 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: sub_i64_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB9_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT: s_mul_i32 s2, s2, 5
; GFX8-NEXT: s_mov_b32 s11, 0xf000
; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB9_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_mul_i32 s2, s2, 5
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB9_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
+; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB9_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB9_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB9_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
+; GFX1164-NEXT: s_mul_i32 s2, s2, 5
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mov_b32_e32 v0, s2
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB9_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB9_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB9_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: sub_i64_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
; GFX1264-NEXT: s_mov_b32 s9, 0
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB9_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[2:3]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
+; GFX1264-NEXT: s_mul_u64 s[2:3], s[8:9], 5
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
-; GFX1264-NEXT: v_mov_b32_e32 v1, s7
+; GFX1264-NEXT: v_mov_b32_e32 v0, s2
+; GFX1264-NEXT: v_mov_b32_e32 v1, s3
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB9_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
+; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: sub_i64_constant:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: s_mov_b32 s5, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0
-; GFX1232-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_mov_b32 s3, exec_lo
+; GFX1232-NEXT: s_mov_b32 s1, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB9_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1232-NEXT: s_bcnt1_i32_b32 s0, s3
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
+; GFX1232-NEXT: s_mul_u64 s[0:1], s[0:1], 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1232-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB9_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
+; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -3307,21 +3302,21 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: sub_i64_uniform:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_mov_b64 s[8:9], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB10_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s12, s6
; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
-; GFX8-NEXT: s_mul_i32 s6, s1, s6
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s2, v0, 0
+; GFX8-NEXT: s_mul_i32 s6, s3, s6
; GFX8-NEXT: s_mov_b32 s15, 0xf000
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s13, s7
@@ -3330,10 +3325,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB10_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0
+; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
@@ -3481,9 +3476,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1164-NEXT: s_mov_b64 s[8:9], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
@@ -3494,9 +3489,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s9, s1, s8
-; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8
-; GFX1164-NEXT: s_mul_i32 s8, s0, s8
+; GFX1164-NEXT: s_mul_i32 s9, s3, s8
+; GFX1164-NEXT: s_mul_hi_u32 s10, s2, s8
+; GFX1164-NEXT: s_mul_i32 s8, s2, s8
; GFX1164-NEXT: s_add_i32 s10, s10, s9
; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: v_mov_b32_e32 v1, s10
@@ -3508,17 +3503,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB10_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
-; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v3
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v5
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
@@ -3530,24 +3525,24 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1132-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB10_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s8
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s8, s1, s3
-; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3
-; GFX1132-NEXT: s_mul_i32 s3, s0, s3
+; GFX1132-NEXT: s_mul_i32 s8, s3, s1
+; GFX1132-NEXT: s_mul_hi_u32 s9, s2, s1
+; GFX1132-NEXT: s_mul_i32 s1, s2, s1
; GFX1132-NEXT: s_add_i32 s9, s9, s8
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s9
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_mov_b32 s8, s6
; GFX1132-NEXT: s_mov_b32 s9, s7
@@ -3556,17 +3551,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB10_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
-; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v5
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
@@ -3578,11 +3573,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_clause 0x1
; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1264-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b32 s11, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -3591,7 +3586,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: ; %bb.1:
; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11]
+; GFX1264-NEXT: s_mul_u64 s[8:9], s[2:3], s[10:11]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: v_mov_b32_e32 v1, s9
@@ -3602,17 +3597,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB10_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
+; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s2, v2, 0
; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-NEXT: s_mov_b32 s6, -1
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5]
-; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s3, v2, v[4:5]
; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s0, v3
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mov_b32_e32 v1, v4
; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
@@ -3624,22 +3619,22 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232: ; %bb.0: ; %entry
; GFX1232-NEXT: s_clause 0x1
; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s3, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
+; GFX1232-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1232-NEXT: s_mov_b32 s9, exec_lo
+; GFX1232-NEXT: s_mov_b32 s1, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB10_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1232-NEXT: s_bcnt1_i32_b32 s0, s9
; GFX1232-NEXT: s_mov_b32 s15, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3]
+; GFX1232-NEXT: s_mul_u64 s[0:1], s[2:3], s[0:1]
; GFX1232-NEXT: s_mov_b32 s14, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1232-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1232-NEXT: s_mov_b32 s12, s6
; GFX1232-NEXT: s_mov_b32 s13, s7
; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN
@@ -3648,15 +3643,15 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: .LBB10_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
+; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s2, v2, 0
; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: s_mov_b32 s6, -1
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5]
-; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s3, v2, v[4:5]
; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232-NEXT: v_mov_b32_e32 v1, v4
; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
@@ -3691,82 +3686,82 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX89-LABEL: sub_i64_varying:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: v_mov_b32_e32 v1, 0
; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i64_varying:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s11, s7
-; GFX10-NEXT: s_mov_b32 s10, s6
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s11, s3
+; GFX10-NEXT: s_mov_b32 s10, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_mov_b32 s5, s1
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: s_mov_b32 s1, s5
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i64_varying:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_mov_b32 s10, s6
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sub_i64_varying:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s11, s7
-; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_mov_b32 s10, s2
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_mov_b32 s5, s1
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_mov_b32 s1, s5
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index af6f6913..98a28b2 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -64,13 +64,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_constant:
@@ -91,13 +91,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
@@ -120,13 +120,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
@@ -148,13 +149,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_constant:
@@ -178,14 +180,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB0_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -209,14 +211,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB0_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -281,14 +283,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
@@ -311,14 +313,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
@@ -343,13 +345,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1064-NEXT: .LBB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
@@ -373,13 +376,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1032-NEXT: .LBB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_uniform:
@@ -405,14 +409,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB1_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0
+; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
+; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -438,14 +442,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB1_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0
+; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
+; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -503,13 +507,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying:
@@ -543,13 +547,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
@@ -584,13 +588,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB2_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
@@ -624,13 +629,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB2_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_varying:
@@ -667,14 +673,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB2_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_add_nc_u32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -711,14 +717,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB2_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_add_nc_u32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -974,17 +980,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1]
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_nop 1
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_constant:
@@ -1005,17 +1010,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1]
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_constant:
@@ -1038,14 +1042,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB4_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_constant:
@@ -1067,14 +1072,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB4_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, v2, 5, s[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i64_constant:
@@ -1098,15 +1104,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB4_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -1131,15 +1137,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB4_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1196,228 +1202,229 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
;
; GFX8-LABEL: add_i64_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB5_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
-; GFX8-NEXT: s_mul_i32 s6, s3, s8
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, 0
+; GFX8-NEXT: s_mul_i32 s2, s7, s8
; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB5_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: v_mul_lo_u32 v3, s7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1]
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB5_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s3, s6
-; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX9-NEXT: s_add_i32 s8, s8, s7
-; GFX9-NEXT: s_mul_i32 s6, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: s_mul_i32 s3, s7, s2
+; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX9-NEXT: s_add_i32 s8, s8, s3
+; GFX9-NEXT: s_mul_i32 s2, s6, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB5_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2]
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_nop 2
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB5_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mul_i32 s7, s3, s6
-; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX1064-NEXT: s_mul_i32 s6, s2, s6
-; GFX1064-NEXT: s_add_i32 s8, s8, s7
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: s_mul_i32 s3, s7, s2
+; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX1064-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064-NEXT: s_add_i32 s8, s8, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: v_mov_b32_e32 v1, s8
; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB5_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
-; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, s[0:1]
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2]
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB5_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s6, s3, s5
-; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5
-; GFX1032-NEXT: s_mul_i32 s5, s2, s5
-; GFX1032-NEXT: s_add_i32 s7, s7, s6
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032-NEXT: s_mul_i32 s2, s7, s1
+; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s1
+; GFX1032-NEXT: s_mul_i32 s1, s6, s1
+; GFX1032-NEXT: s_add_i32 s3, s3, s2
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB5_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
-; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s6, v2, s[0:1]
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s7, v2, v[1:2]
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i64_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB5_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s7, s3, s6
-; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX1164-NEXT: s_mul_i32 s6, s2, s6
-; GFX1164-NEXT: s_add_i32 s8, s8, s7
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: s_mul_i32 s3, s7, s2
+; GFX1164-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX1164-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164-NEXT: s_add_i32 s8, s8, s3
+; GFX1164-NEXT: v_mov_b32_e32 v0, s2
; GFX1164-NEXT: v_mov_b32_e32 v1, s8
; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB5_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s6, v2, s[0:1]
+; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s7, v2, v[1:2]
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i64_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB5_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s6, s3, s5
-; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5
-; GFX1132-NEXT: s_mul_i32 s5, s2, s5
-; GFX1132-NEXT: s_add_i32 s7, s7, s6
+; GFX1132-NEXT: s_mul_i32 s2, s7, s1
+; GFX1132-NEXT: s_mul_hi_u32 s3, s6, s1
+; GFX1132-NEXT: s_mul_i32 s1, s6, s1
+; GFX1132-NEXT: s_add_i32 s3, s3, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s3
; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB5_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s6, v2, s[0:1]
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s7, v2, v[1:2]
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1447,51 +1454,51 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i64_varying:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i64_varying:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1554,14 +1561,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB7_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
@@ -1582,14 +1589,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB7_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_constant:
@@ -1612,14 +1619,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB7_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_constant:
@@ -1641,14 +1649,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB7_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_constant:
@@ -1672,15 +1681,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB7_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -1704,15 +1713,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB7_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1777,14 +1786,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB8_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_uniform:
@@ -1807,14 +1816,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB8_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_uniform:
@@ -1839,14 +1848,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1064-NEXT: .LBB8_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_uniform:
@@ -1870,14 +1879,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1032-NEXT: .LBB8_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_uniform:
@@ -1903,15 +1912,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB8_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -1937,15 +1946,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB8_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2003,13 +2012,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB9_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_varying:
@@ -2043,13 +2052,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB9_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_varying:
@@ -2084,13 +2093,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB9_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_varying:
@@ -2124,13 +2134,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB9_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_varying:
@@ -2167,14 +2178,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB9_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -2211,14 +2222,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB9_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2474,18 +2485,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB11_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_constant:
@@ -2506,18 +2517,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB11_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_constant:
@@ -2540,17 +2551,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB11_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_constant:
@@ -2572,17 +2584,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB11_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i64_constant:
@@ -2606,18 +2619,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB11_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -2642,18 +2655,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB11_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2710,241 +2723,241 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
;
; GFX8-LABEL: sub_i64_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB12_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
-; GFX8-NEXT: s_mul_i32 s6, s3, s8
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, 0
+; GFX8-NEXT: s_mul_i32 s2, s7, s8
; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB12_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
-; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
+; GFX8-NEXT: v_mul_lo_u32 v4, s7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v2, 0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB12_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s3, s6
-; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX9-NEXT: s_add_i32 s8, s8, s7
-; GFX9-NEXT: s_mul_i32 s6, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: s_mul_i32 s3, s7, s2
+; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX9-NEXT: s_add_i32 s8, s8, s3
+; GFX9-NEXT: s_mul_i32 s2, s6, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB12_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s7, v2, v[4:5]
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v3
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB12_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mul_i32 s7, s3, s6
-; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX1064-NEXT: s_mul_i32 s6, s2, s6
-; GFX1064-NEXT: s_add_i32 s8, s8, s7
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: s_mul_i32 s3, s7, s2
+; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX1064-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064-NEXT: s_add_i32 s8, s8, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: v_mov_b32_e32 v1, s8
; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB12_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3
+; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v2, v[4:5]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3
; GFX1064-NEXT: v_mov_b32_e32 v1, v4
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB12_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s6, s3, s5
-; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5
-; GFX1032-NEXT: s_mul_i32 s5, s2, s5
-; GFX1032-NEXT: s_add_i32 s7, s7, s6
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032-NEXT: s_mul_i32 s2, s7, s1
+; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s1
+; GFX1032-NEXT: s_mul_i32 s1, s6, s1
+; GFX1032-NEXT: s_add_i32 s3, s3, s2
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB12_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0
-; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
+; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s6, v2, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s7, v2, v[4:5]
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
; GFX1032-NEXT: v_mov_b32_e32 v1, v4
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i64_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB12_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s7, s3, s6
-; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX1164-NEXT: s_mul_i32 s6, s2, s6
-; GFX1164-NEXT: s_add_i32 s8, s8, s7
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: s_mul_i32 s3, s7, s2
+; GFX1164-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX1164-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164-NEXT: s_add_i32 s8, s8, s3
+; GFX1164-NEXT: v_mov_b32_e32 v0, s2
; GFX1164-NEXT: v_mov_b32_e32 v1, s8
; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB12_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s6, v2, 0
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s7, v2, v[4:5]
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v3
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v5
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i64_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB12_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s6, s3, s5
-; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5
-; GFX1132-NEXT: s_mul_i32 s5, s2, s5
-; GFX1132-NEXT: s_add_i32 s7, s7, s6
+; GFX1132-NEXT: s_mul_i32 s2, s7, s1
+; GFX1132-NEXT: s_mul_hi_u32 s3, s6, s1
+; GFX1132-NEXT: s_mul_i32 s1, s6, s1
+; GFX1132-NEXT: s_add_i32 s3, s3, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s3
; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB12_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s6, v2, 0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
-; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s7, v2, v[4:5]
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v5
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2974,51 +2987,51 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i64_varying:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i64_varying:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3078,13 +3091,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB14_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_and_b32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: and_i32_varying:
@@ -3118,13 +3131,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB14_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_and_b32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: and_i32_varying:
@@ -3159,13 +3172,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB14_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_and_b32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: and_i32_varying:
@@ -3199,13 +3213,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB14_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_and_b32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: and_i32_varying:
@@ -3242,14 +3257,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB14_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_and_b32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -3286,14 +3301,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB14_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_and_b32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -3352,13 +3367,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB15_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: or_i32_varying:
@@ -3392,13 +3407,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB15_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_or_b32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: or_i32_varying:
@@ -3433,13 +3448,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB15_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_or_b32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: or_i32_varying:
@@ -3473,13 +3489,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB15_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_or_b32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: or_i32_varying:
@@ -3516,14 +3533,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB15_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_or_b32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -3560,14 +3577,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB15_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_or_b32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -3626,13 +3643,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB16_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: xor_i32_varying:
@@ -3666,13 +3683,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB16_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: xor_i32_varying:
@@ -3707,13 +3724,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB16_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: xor_i32_varying:
@@ -3747,13 +3765,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB16_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: xor_i32_varying:
@@ -3790,14 +3809,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB16_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -3834,14 +3853,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB16_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -3900,13 +3919,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB17_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_max_i32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: max_i32_varying:
@@ -3940,13 +3959,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB17_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_max_i32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: max_i32_varying:
@@ -3981,13 +4000,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB17_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_max_i32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: max_i32_varying:
@@ -4021,13 +4041,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB17_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_max_i32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: max_i32_varying:
@@ -4064,14 +4085,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB17_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_max_i32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -4108,14 +4129,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB17_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_max_i32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -4180,21 +4201,21 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB18_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: max_i64_constant:
@@ -4213,21 +4234,21 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB18_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: max_i64_constant:
@@ -4248,18 +4269,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB18_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: max_i64_constant:
@@ -4279,18 +4301,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB18_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
-; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: max_i64_constant:
@@ -4311,19 +4334,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB18_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -4344,19 +4367,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB18_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -4414,13 +4437,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB19_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_min_i32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: min_i32_varying:
@@ -4454,13 +4477,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB19_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_min_i32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: min_i32_varying:
@@ -4495,13 +4518,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB19_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_min_i32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: min_i32_varying:
@@ -4535,13 +4559,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB19_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_min_i32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: min_i32_varying:
@@ -4578,14 +4603,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB19_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_min_i32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -4622,14 +4647,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB19_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_min_i32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -4694,21 +4719,21 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB20_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: min_i64_constant:
@@ -4727,21 +4752,21 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB20_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: min_i64_constant:
@@ -4762,18 +4787,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB20_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: min_i64_constant:
@@ -4793,18 +4819,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB20_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
-; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: min_i64_constant:
@@ -4825,19 +4852,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB20_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -4858,19 +4885,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB20_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -4928,13 +4955,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB21_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_max_u32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umax_i32_varying:
@@ -4968,13 +4995,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB21_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_max_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umax_i32_varying:
@@ -5009,13 +5036,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB21_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_max_u32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umax_i32_varying:
@@ -5049,13 +5077,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB21_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_max_u32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umax_i32_varying:
@@ -5092,14 +5121,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB21_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_max_u32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -5136,14 +5165,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB21_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_max_u32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -5207,20 +5236,20 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umax_i64_constant:
@@ -5239,20 +5268,20 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umax_i64_constant:
@@ -5273,18 +5302,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB22_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s1, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umax_i64_constant:
@@ -5304,18 +5334,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB22_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
-; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s1, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umax_i64_constant:
@@ -5336,19 +5367,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB22_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s1, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -5369,19 +5400,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB22_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s1, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -5439,13 +5470,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB23_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_min_u32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i32_varying:
@@ -5479,13 +5510,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB23_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_min_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i32_varying:
@@ -5520,13 +5551,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB23_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_min_u32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i32_varying:
@@ -5560,13 +5592,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB23_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_min_u32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umin_i32_varying:
@@ -5603,14 +5636,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB23_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_min_u32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -5647,14 +5680,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB23_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_min_u32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -5718,20 +5751,20 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
@@ -5750,20 +5783,20 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
@@ -5784,18 +5817,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
@@ -5815,18 +5849,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
-; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umin_i64_constant:
@@ -5847,19 +5882,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB24_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -5880,19 +5915,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB24_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index ca94d68..aa5c480 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -62,13 +62,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -90,13 +90,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
@@ -118,13 +118,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -145,13 +146,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -174,14 +176,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB0_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -205,14 +207,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB0_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -237,14 +239,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB0_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -268,14 +270,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -335,14 +337,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -365,14 +367,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
@@ -395,13 +397,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
+; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
@@ -423,13 +426,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
+; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
@@ -453,14 +457,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -485,14 +489,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -518,14 +522,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
+; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[0:1]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -550,14 +554,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
+; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -611,13 +615,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -652,13 +656,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
@@ -692,13 +696,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB2_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
@@ -731,13 +736,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB2_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
@@ -773,14 +779,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB2_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -816,13 +822,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB2_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -860,14 +866,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB2_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -903,13 +909,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -937,12 +943,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: add_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -950,51 +956,54 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: add_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i32_varying_offset:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: add_i32_varying_offset:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1053,14 +1062,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1082,14 +1091,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
@@ -1111,14 +1120,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB4_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
@@ -1139,14 +1149,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB4_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
@@ -1169,15 +1180,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB4_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1201,15 +1212,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB4_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1234,15 +1245,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB4_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1266,15 +1277,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB4_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1334,14 +1345,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1364,14 +1375,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
@@ -1394,14 +1405,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
@@ -1423,14 +1434,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
@@ -1454,15 +1465,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB5_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1487,15 +1498,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB5_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1521,15 +1532,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1554,15 +1565,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1616,13 +1627,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB6_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1657,13 +1668,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB6_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
@@ -1697,13 +1708,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB6_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
@@ -1736,13 +1748,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB6_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_vdata:
@@ -1778,14 +1791,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB6_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1821,14 +1834,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB6_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1866,14 +1879,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1909,14 +1922,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1944,12 +1957,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: sub_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -1957,51 +1970,54 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: sub_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i32_varying_offset:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sub_i32_varying_offset:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 7e15c07..783c5d4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -64,13 +64,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -93,13 +93,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
@@ -122,13 +122,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -150,13 +151,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -180,14 +182,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB0_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -212,14 +214,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB0_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -245,14 +247,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB0_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -276,14 +278,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -345,14 +347,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -376,14 +378,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
@@ -407,13 +409,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
+; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
@@ -436,13 +439,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
+; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
@@ -467,14 +471,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -500,14 +504,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -534,14 +538,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
+; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[0:1]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -566,14 +570,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
+; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -629,13 +633,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -671,13 +675,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
@@ -712,13 +716,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB2_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
@@ -752,13 +757,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB2_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
@@ -795,14 +801,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB2_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -839,13 +845,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB2_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -884,14 +890,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB2_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -928,13 +934,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -962,12 +968,12 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: add_i32_varying_vindex:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 idxen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -975,51 +981,54 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: add_i32_varying_vindex:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_vindex:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i32_varying_vindex:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: add_i32_varying_vindex:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1053,13 +1062,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-NEXT: s_mov_b32 s2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -1070,13 +1078,13 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-NEXT: s_mov_b32 s2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_offset:
@@ -1085,13 +1093,13 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v2, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_offset:
@@ -1100,13 +1108,13 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX11W64-NEXT: s_mov_b32 s2, 0
; GFX11W64-NEXT: v_mov_b32_e32 v1, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, s2
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1118,41 +1126,43 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2
; GFX11W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
;
; GFX12W64-LABEL: add_i32_varying_offset:
; GFX12W64: ; %bb.0: ; %entry
+; GFX12W64-NEXT: s_clause 0x1
; GFX12W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: v_mov_b32_e32 v1, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
;
; GFX12W32-LABEL: add_i32_varying_offset:
; GFX12W32: ; %bb.0: ; %entry
+; GFX12W32-NEXT: s_clause 0x1
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
; GFX12W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1213,14 +1223,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1243,14 +1253,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
@@ -1273,14 +1283,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
@@ -1302,14 +1313,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
@@ -1333,15 +1345,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB5_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1366,15 +1378,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB5_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1400,15 +1412,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1432,15 +1444,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1502,14 +1514,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB6_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1533,14 +1545,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB6_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
@@ -1564,14 +1576,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB6_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
@@ -1594,14 +1606,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB6_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
@@ -1626,15 +1638,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB6_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1660,15 +1672,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W32-NEXT: .LBB6_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1695,15 +1707,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1728,15 +1740,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1792,13 +1804,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB7_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1834,13 +1846,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB7_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
@@ -1875,13 +1887,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB7_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
@@ -1915,13 +1928,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB7_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_vdata:
@@ -1958,14 +1972,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB7_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -2002,14 +2016,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB7_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -2048,14 +2062,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB7_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -2092,14 +2106,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB7_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -2127,12 +2141,12 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: sub_i32_varying_vindex:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -2140,51 +2154,54 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: sub_i32_varying_vindex:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_vindex:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i32_varying_vindex:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sub_i32_varying_vindex:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2218,13 +2235,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-NEXT: s_mov_b32 s2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -2235,13 +2251,13 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-NEXT: s_mov_b32 s2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_offset:
@@ -2250,13 +2266,13 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v2, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_offset:
@@ -2265,13 +2281,13 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX11W64-NEXT: s_mov_b32 s2, 0
; GFX11W64-NEXT: v_mov_b32_e32 v1, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, s2
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -2283,41 +2299,43 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2
; GFX11W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
;
; GFX12W64-LABEL: sub_i32_varying_offset:
; GFX12W64: ; %bb.0: ; %entry
+; GFX12W64-NEXT: s_clause 0x1
; GFX12W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: v_mov_b32_e32 v1, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
;
; GFX12W32-LABEL: sub_i32_varying_offset:
; GFX12W32: ; %bb.0: ; %entry
+; GFX12W32-NEXT: s_clause 0x1
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
; GFX12W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index ad6009e..d74623a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -59,12 +59,12 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6
; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2
; GFX12-SDAG-NEXT: s_endpgm
@@ -73,12 +73,12 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2
; GFX12-GISEL-NEXT: s_endpgm
@@ -140,14 +140,14 @@ entry:
define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -156,12 +156,12 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -175,21 +175,21 @@ entry:
define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s2, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s0
; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s0
; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
; GFX12-GISEL-NEXT: s_endpgm
entry:
@@ -201,21 +201,21 @@ entry:
define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s2, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s0
; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s0
; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1
; GFX12-GISEL-NEXT: s_endpgm
entry:
@@ -227,22 +227,22 @@ entry:
define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: ds_store_b32 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s4
; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: ds_store_b32 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll
index 0f20ed1..1b277c0 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll
@@ -6,36 +6,36 @@
define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) {
; VI-LABEL: bfe_combine8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_bfe_u32 v0, v0, 8, 8
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; VI-SDWA-LABEL: bfe_combine8:
; VI-SDWA: ; %bb.0:
-; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDWA-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDWA-NEXT: v_mov_b32_e32 v1, 2
; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-SDWA-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-SDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDWA-NEXT: flat_load_dword v2, v[0:1]
-; VI-SDWA-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDWA-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDWA-NEXT: v_mov_b32_e32 v1, s3
; VI-SDWA-NEXT: s_waitcnt vmcnt(0)
; VI-SDWA-NEXT: flat_store_dword v[0:1], v2
; VI-SDWA-NEXT: s_endpgm
@@ -71,40 +71,40 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x)
define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x) {
; VI-LABEL: bfe_combine16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_bfe_u32 v0, v0, 16, 16
; VI-NEXT: v_lshlrev_b32_e32 v0, 15, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; VI-SDWA-LABEL: bfe_combine16:
; VI-SDWA: ; %bb.0:
-; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDWA-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDWA-NEXT: v_mov_b32_e32 v1, 15
; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-SDWA-NEXT: v_mov_b32_e32 v1, 0
; VI-SDWA-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-SDWA-NEXT: v_mov_b32_e32 v2, s1
-; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-SDWA-NEXT: v_mov_b32_e32 v2, s3
+; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-SDWA-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; VI-SDWA-NEXT: flat_load_dword v2, v[0:1]
-; VI-SDWA-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDWA-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDWA-NEXT: v_mov_b32_e32 v1, s3
; VI-SDWA-NEXT: s_waitcnt vmcnt(0)
; VI-SDWA-NEXT: flat_store_dword v[0:1], v2
; VI-SDWA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index 1639ec6..15cd6f7 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -23,18 +23,18 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_ubfe_sub_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_bfe_u32 v2, v3, 0, v4
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -78,18 +78,18 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
;
; VI-LABEL: v_ubfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
@@ -221,18 +221,18 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_sbfe_sub_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_bfe_i32 v2, v3, 0, v4
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -276,18 +276,18 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
;
; VI-LABEL: v_sbfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
@@ -418,14 +418,14 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out,
; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_or_b32 s0, s2, s0
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_bfe_i32 s0, s0, 0xf0000
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -463,16 +463,16 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou
; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s1, s2, 17
-; VI-NEXT: s_lshl_b32 s0, s0, 19
-; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_lshl_b32 s0, s0, 17
+; VI-NEXT: s_lshl_b32 s1, s1, 19
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_ashr_i32 s0, s0, 17
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -510,16 +510,16 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out,
; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s1, s2, 17
-; VI-NEXT: s_lshl_b32 s0, s0, 16
-; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_lshl_b32 s0, s0, 17
+; VI-NEXT: s_lshl_b32 s1, s1, 16
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_ashr_i32 s0, s0, 17
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 7b8eacc..31b5b16 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -1426,11 +1426,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX8-LABEL: s_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
-; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[4:5], s[6:7]
+; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1442,11 +1442,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
-; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[6:7]
+; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1457,11 +1457,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX8-GISEL-LABEL: s_bitselect_i64_pat_0:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
-; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[6:7]
+; GFX8-GISEL-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1473,11 +1473,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
-; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[6:7]
+; GFX10-GISEL-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1514,11 +1514,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX8-LABEL: s_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1530,11 +1530,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1545,11 +1545,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX8-GISEL-LABEL: s_bitselect_i64_pat_1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1561,11 +1561,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1602,11 +1602,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX8-LABEL: s_bitselect_i64_pat_2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1618,11 +1618,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1633,11 +1633,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX8-GISEL-LABEL: s_bitselect_i64_pat_2:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1649,11 +1649,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1691,12 +1691,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX8-LABEL: s_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
-; GFX8-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX8-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1708,12 +1708,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1724,12 +1724,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX8-GISEL-LABEL: s_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
-; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
-; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1741,12 +1741,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll
index 8b2f66b..935909e 100644
--- a/llvm/test/CodeGen/AMDGPU/bfm.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfm.ll
@@ -48,13 +48,13 @@ define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) #
;
; VI-LABEL: s_bfm_pattern_simple:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfm_b32 s2, s2, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_bfm_b32 s0, s4, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%a = shl i32 1, %x
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index 49ec09d..6c4791d 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -34,41 +34,41 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
;
; FLAT-LABEL: s_brev_i16:
; FLAT: ; %bb.0:
-; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
+; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c
+; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: s_brev_b32 s4, s4
-; FLAT-NEXT: s_lshr_b32 s4, s4, 16
-; FLAT-NEXT: v_mov_b32_e32 v0, s4
-; FLAT-NEXT: buffer_store_short v0, off, s[0:3], 0
+; FLAT-NEXT: s_brev_b32 s0, s2
+; FLAT-NEXT: s_lshr_b32 s0, s0, 16
+; FLAT-NEXT: v_mov_b32_e32 v0, s0
+; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_i16:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: s_and_b32 s2, s2, 0xffff
-; GISEL-NEXT: s_brev_b32 s2, s2
-; GISEL-NEXT: s_lshr_b32 s2, s2, 16
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: s_and_b32 s0, s4, 0xffff
+; GISEL-NEXT: s_brev_b32 s0, s0
+; GISEL-NEXT: s_lshr_b32 s0, s0, 16
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: flat_store_short v[0:1], v2
; GISEL-NEXT: s_endpgm
;
; GFX11-FLAT-LABEL: s_brev_i16:
; GFX11-FLAT: ; %bb.0:
; GFX11-FLAT-NEXT: s_clause 0x1
-; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLAT-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-FLAT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT: s_brev_b32 s2, s2
+; GFX11-FLAT-NEXT: s_brev_b32 s0, s4
; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
+; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3]
; GFX11-FLAT-NEXT: s_nop 0
; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLAT-NEXT: s_endpgm
@@ -76,17 +76,17 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
; GFX11-GISEL-LABEL: s_brev_i16:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-GISEL-NEXT: s_and_b32 s0, s4, 0xffff
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_brev_b32 s2, s2
-; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-GISEL-NEXT: s_brev_b32 s0, s0
+; GFX11-GISEL-NEXT: s_lshr_b32 s0, s0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -199,25 +199,25 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #
;
; FLAT-LABEL: s_brev_i32:
; FLAT: ; %bb.0:
-; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
+; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c
+; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: s_brev_b32 s4, s4
-; FLAT-NEXT: v_mov_b32_e32 v0, s4
-; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; FLAT-NEXT: s_brev_b32 s0, s2
+; FLAT-NEXT: v_mov_b32_e32 v0, s0
+; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_i32:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: s_brev_b32 s2, s2
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: s_brev_b32 s0, s4
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: flat_store_dword v[0:1], v2
; GISEL-NEXT: s_endpgm
;
@@ -225,14 +225,14 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #
; GFX11-FLAT: ; %bb.0:
; GFX11-FLAT-NEXT: s_clause 0x1
; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLAT-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FLAT-NEXT: s_mov_b32 s6, -1
; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT: s_brev_b32 s2, s2
+; GFX11-FLAT-NEXT: s_brev_b32 s0, s2
; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
-; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FLAT-NEXT: s_nop 0
; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLAT-NEXT: s_endpgm
@@ -240,14 +240,14 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #
; GFX11-GISEL-LABEL: s_brev_i32:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_brev_b32 s2, s2
+; GFX11-GISEL-NEXT: s_brev_b32 s0, s4
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -702,17 +702,17 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64>
; FLAT-LABEL: s_brev_v2i64:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
+; FLAT-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; FLAT-NEXT: s_mov_b32 s11, 0xf000
+; FLAT-NEXT: s_mov_b32 s10, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: s_brev_b64 s[6:7], s[6:7]
-; FLAT-NEXT: s_brev_b64 s[4:5], s[4:5]
-; FLAT-NEXT: v_mov_b32_e32 v0, s4
-; FLAT-NEXT: v_mov_b32_e32 v1, s5
-; FLAT-NEXT: v_mov_b32_e32 v2, s6
-; FLAT-NEXT: v_mov_b32_e32 v3, s7
-; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; FLAT-NEXT: s_brev_b64 s[0:1], s[6:7]
+; FLAT-NEXT: s_brev_b64 s[2:3], s[4:5]
+; FLAT-NEXT: v_mov_b32_e32 v0, s2
+; FLAT-NEXT: v_mov_b32_e32 v1, s3
+; FLAT-NEXT: v_mov_b32_e32 v2, s0
+; FLAT-NEXT: v_mov_b32_e32 v3, s1
+; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_v2i64:
@@ -735,15 +735,15 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64>
; GFX11-FLAT: ; %bb.0:
; GFX11-FLAT-NEXT: s_clause 0x1
; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLAT-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
+; GFX11-FLAT-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FLAT-NEXT: s_mov_b32 s10, -1
; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[4:5]
-; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[6:7]
-; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FLAT-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
-; GFX11-FLAT-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-FLAT-NEXT: s_brev_b64 s[0:1], s[4:5]
+; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[6:7]
+; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FLAT-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FLAT-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
; GFX11-FLAT-NEXT: s_nop 0
; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLAT-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index 3dbbb87..8bee436 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -137,42 +137,42 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
;
; VI-LABEL: br_cc_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_cbranch_vccnz .LBB1_2
; VI-NEXT: ; %bb.1: ; %one
; VI-NEXT: v_mov_b32_e32 v0, 0x3800
; VI-NEXT: .LBB1_2: ; %two
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: br_cc_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s0, s6
+; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %one
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX11-NEXT: .LBB1_2: ; %two
-; GFX11-NEXT: s_mov_b32 s2, s6
-; GFX11-NEXT: s_mov_b32 s3, s7
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_mov_b32 s6, s2
+; GFX11-NEXT: s_mov_b32 s7, s3
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -221,44 +221,44 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
;
; VI-LABEL: br_cc_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ngt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_cbranch_vccnz .LBB2_2
; VI-NEXT: ; %bb.1: ; %one
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB2_2: ; %two
; VI-NEXT: v_mov_b32_e32 v0, 0x3800
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: br_cc_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s0, s6
+; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: s_cbranch_vccz .LBB2_2
; GFX11-NEXT: ; %bb.1: ; %two
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX11-NEXT: .LBB2_2: ; %one
-; GFX11-NEXT: s_mov_b32 s2, s6
-; GFX11-NEXT: s_mov_b32 s3, s7
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_mov_b32 s6, s2
+; GFX11-NEXT: s_mov_b32 s7, s3
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 384715a..b8d9878 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
- ; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
+ ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index e4c7df3..134e76c 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -34,29 +34,29 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_bswap_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v0, 0, s2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_perm_b32 v0, 0, s6, v0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, 0, s2, 0x10203
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v0, 0, s0, 0x10203
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -87,31 +87,31 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v1, 0, s3, v0
-; VI-NEXT: v_perm_b32 v0, 0, s2, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_perm_b32 v1, 0, s7, v0
+; VI-NEXT: v_perm_b32 v0, 0, s6, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_v2i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, 0, s5, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s4, 0x10203
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v1, 0, s1, 0x10203
+; GFX11-NEXT: v_perm_b32 v0, 0, s0, 0x10203
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -148,35 +148,35 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v3, 0, s11, v0
; VI-NEXT: v_perm_b32 v2, 0, s10, v0
; VI-NEXT: v_perm_b32 v1, 0, s9, v0
; VI-NEXT: v_perm_b32 v0, 0, s8, v0
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_v4i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v3, 0, s7, 0x10203
-; GFX11-NEXT: v_perm_b32 v2, 0, s6, 0x10203
-; GFX11-NEXT: v_perm_b32 v1, 0, s5, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s4, 0x10203
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v3, 0, s3, 0x10203
+; GFX11-NEXT: v_perm_b32 v2, 0, s2, 0x10203
+; GFX11-NEXT: v_perm_b32 v1, 0, s1, 0x10203
+; GFX11-NEXT: v_perm_b32 v0, 0, s0, 0x10203
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -296,31 +296,31 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_bswap_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v1, 0, s2, v0
-; VI-NEXT: v_perm_b32 v0, 0, s3, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_perm_b32 v1, 0, s6, v0
+; VI-NEXT: v_perm_b32 v0, 0, s7, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v1, 0, s0, 0x10203
+; GFX11-NEXT: v_perm_b32 v0, 0, s1, 0x10203
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -357,35 +357,35 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v2i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v3, 0, s10, v0
; VI-NEXT: v_perm_b32 v2, 0, s11, v0
; VI-NEXT: v_perm_b32 v1, 0, s8, v0
; VI-NEXT: v_perm_b32 v0, 0, s9, v0
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_v2i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v3, 0, s6, 0x10203
-; GFX11-NEXT: v_perm_b32 v2, 0, s7, 0x10203
-; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v3, 0, s2, 0x10203
+; GFX11-NEXT: v_perm_b32 v2, 0, s3, 0x10203
+; GFX11-NEXT: v_perm_b32 v1, 0, s0, 0x10203
+; GFX11-NEXT: v_perm_b32 v0, 0, s1, 0x10203
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 8d347ae..04ee81b 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -19,12 +19,12 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
;
; GFX8-LABEL: build_vector2:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v1, 6
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -52,12 +52,12 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
;
; GFX940-LABEL: build_vector2:
; GFX940: ; %bb.0: ; %entry
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mov_b32_e32 v0, 5
; GFX940-NEXT: v_mov_b32_e32 v1, 6
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
@@ -80,14 +80,14 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
;
; GFX8-LABEL: build_vector4:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v1, 6
; GFX8-NEXT: v_mov_b32_e32 v2, 7
; GFX8-NEXT: v_mov_b32_e32 v3, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -119,14 +119,14 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
;
; GFX940-LABEL: build_vector4:
; GFX940: ; %bb.0: ; %entry
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_mov_b32_e32 v0, 5
; GFX940-NEXT: v_mov_b32_e32 v1, 6
; GFX940-NEXT: v_mov_b32_e32 v2, 7
; GFX940-NEXT: v_mov_b32_e32 v3, 8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
@@ -146,11 +146,11 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
;
; GFX8-LABEL: build_vector_v2i16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 0x60005
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -176,11 +176,11 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
;
; GFX940-LABEL: build_vector_v2i16:
; GFX940: ; %bb.0: ; %entry
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
store <2 x i16> <i16 5, i16 6>, ptr addrspace(1) %out
@@ -201,14 +201,14 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
;
; GFX8-LABEL: build_vector_v2i16_trunc:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s2, s2, 16
-; GFX8-NEXT: s_or_b32 s2, s2, 0x50000
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_lshr_b32 s0, s4, 16
+; GFX8-NEXT: s_or_b32 s0, s0, 0x50000
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 00af922..d5a9607 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -1357,29 +1357,29 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
;
; VI-LABEL: amd_kernel_v8i8:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s2, s1, 24
-; VI-NEXT: s_lshr_b32 s3, s1, 16
-; VI-NEXT: s_add_i32 s3, s3, s3
-; VI-NEXT: s_add_i32 s2, s2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_lshr_b32 s0, s3, 24
+; VI-NEXT: s_lshr_b32 s1, s3, 16
; VI-NEXT: s_add_i32 s1, s1, s1
-; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_add_i32 s3, s3, s3
+; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_lshr_b32 s4, s0, 24
-; VI-NEXT: s_lshr_b32 s5, s0, 16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_lshr_b32 s4, s2, 24
+; VI-NEXT: s_lshr_b32 s5, s2, 16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_add_i32 s2, s2, s2
; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_add_i32 s5, s5, s5
; VI-NEXT: s_add_i32 s4, s4, s4
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
@@ -1392,20 +1392,20 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
;
; GFX11-LABEL: amd_kernel_v8i8:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0
-; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1
-; GFX11-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s0, 24
-; GFX11-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-NEXT: s_lshr_b32 s5, s1, 24
-; GFX11-NEXT: v_add_nc_u16 v2, s1, s1
-; GFX11-NEXT: v_add_nc_u16 v3, s0, s0
+; GFX11-NEXT: v_lshrrev_b16 v0, 8, s2
+; GFX11-NEXT: v_lshrrev_b16 v1, 8, s3
+; GFX11-NEXT: s_lshr_b32 s0, s2, 16
+; GFX11-NEXT: s_lshr_b32 s1, s2, 24
+; GFX11-NEXT: s_lshr_b32 s4, s3, 16
+; GFX11-NEXT: s_lshr_b32 s5, s3, 24
+; GFX11-NEXT: v_add_nc_u16 v2, s3, s3
+; GFX11-NEXT: v_add_nc_u16 v3, s2, s2
; GFX11-NEXT: v_add_nc_u16 v4, s5, s5
; GFX11-NEXT: v_add_nc_u16 v5, s4, s4
-; GFX11-NEXT: v_add_nc_u16 v6, s3, s3
-; GFX11-NEXT: v_add_nc_u16 v7, s2, s2
+; GFX11-NEXT: v_add_nc_u16 v6, s1, s1
+; GFX11-NEXT: v_add_nc_u16 v7, s0, s0
; GFX11-NEXT: v_add_nc_u16 v1, v1, v1
; GFX11-NEXT: v_add_nc_u16 v0, v0, v0
; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -1524,58 +1524,58 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
;
; VI-LABEL: amd_kernel_v16i8:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s3, 24
-; VI-NEXT: s_lshr_b32 s5, s3, 16
-; VI-NEXT: s_add_i32 s5, s5, s5
-; VI-NEXT: s_add_i32 s4, s4, s4
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_add_i32 s3, s3, s3
-; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_lshr_b32 s6, s2, 24
-; VI-NEXT: s_lshr_b32 s7, s2, 16
+; VI-NEXT: s_lshr_b32 s0, s7, 24
+; VI-NEXT: s_lshr_b32 s1, s7, 16
+; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: s_add_i32 s7, s7, s7
+; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_lshr_b32 s2, s6, 24
+; VI-NEXT: s_lshr_b32 s3, s6, 16
; VI-NEXT: v_add_u32_sdwa v3, vcc, v3, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: s_add_i32 s7, s7, s7
-; VI-NEXT: s_add_i32 s6, s6, s6
-; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: s_add_i32 s3, s3, s3
; VI-NEXT: s_add_i32 s2, s2, s2
+; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: s_add_i32 s6, s6, s6
; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s6
-; VI-NEXT: v_mov_b32_e32 v5, s7
-; VI-NEXT: s_lshr_b32 s8, s1, 24
-; VI-NEXT: s_lshr_b32 s9, s1, 16
+; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_lshr_b32 s8, s5, 24
+; VI-NEXT: s_lshr_b32 s9, s5, 16
; VI-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v5, s2
+; VI-NEXT: v_mov_b32_e32 v5, s6
; VI-NEXT: s_add_i32 s9, s9, s9
; VI-NEXT: s_add_i32 s8, s8, s8
; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_add_i32 s5, s5, s5
; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s8
; VI-NEXT: v_mov_b32_e32 v5, s9
-; VI-NEXT: s_lshr_b32 s10, s0, 24
-; VI-NEXT: s_lshr_b32 s11, s0, 16
+; VI-NEXT: s_lshr_b32 s10, s4, 24
+; VI-NEXT: s_lshr_b32 s11, s4, 16
; VI-NEXT: v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_i32 s11, s11, s11
; VI-NEXT: s_add_i32 s10, s10, s10
; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_add_i32 s4, s4, s4
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s10
; VI-NEXT: v_mov_b32_e32 v5, s11
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v6, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v5, 0
@@ -1585,36 +1585,36 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
;
; GFX11-LABEL: amd_kernel_v16i8:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s10, s3, 16
-; GFX11-NEXT: s_lshr_b32 s11, s3, 24
-; GFX11-NEXT: v_lshrrev_b16 v2, 8, s2
-; GFX11-NEXT: v_lshrrev_b16 v3, 8, s3
+; GFX11-NEXT: s_lshr_b32 s10, s7, 16
+; GFX11-NEXT: s_lshr_b32 s11, s7, 24
+; GFX11-NEXT: v_lshrrev_b16 v2, 8, s6
+; GFX11-NEXT: v_lshrrev_b16 v3, 8, s7
; GFX11-NEXT: v_add_nc_u16 v7, s11, s11
; GFX11-NEXT: v_add_nc_u16 v8, s10, s10
-; GFX11-NEXT: v_add_nc_u16 v4, s3, s3
-; GFX11-NEXT: v_add_nc_u16 v5, s2, s2
+; GFX11-NEXT: v_add_nc_u16 v4, s7, s7
+; GFX11-NEXT: v_add_nc_u16 v5, s6, s6
; GFX11-NEXT: v_add_nc_u16 v3, v3, v3
; GFX11-NEXT: v_add_nc_u16 v2, v2, v2
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT: s_lshr_b32 s7, s1, 24
-; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1
-; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0
-; GFX11-NEXT: v_add_nc_u16 v11, s7, s7
+; GFX11-NEXT: s_lshr_b32 s3, s5, 24
+; GFX11-NEXT: v_lshrrev_b16 v1, 8, s5
+; GFX11-NEXT: v_lshrrev_b16 v0, 8, s4
+; GFX11-NEXT: v_add_nc_u16 v11, s3, s3
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2
; GFX11-NEXT: v_or_b32_e32 v7, v8, v7
-; GFX11-NEXT: s_lshr_b32 s6, s1, 16
-; GFX11-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-NEXT: s_lshr_b32 s5, s0, 24
-; GFX11-NEXT: s_lshr_b32 s8, s2, 16
-; GFX11-NEXT: s_lshr_b32 s9, s2, 24
-; GFX11-NEXT: v_add_nc_u16 v6, s1, s1
-; GFX11-NEXT: v_add_nc_u16 v12, s6, s6
+; GFX11-NEXT: s_lshr_b32 s2, s5, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
+; GFX11-NEXT: s_lshr_b32 s1, s4, 24
+; GFX11-NEXT: s_lshr_b32 s8, s6, 16
+; GFX11-NEXT: s_lshr_b32 s9, s6, 24
+; GFX11-NEXT: v_add_nc_u16 v6, s5, s5
+; GFX11-NEXT: v_add_nc_u16 v12, s2, s2
; GFX11-NEXT: v_add_nc_u16 v1, v1, v1
; GFX11-NEXT: v_add_nc_u16 v9, s9, s9
; GFX11-NEXT: v_add_nc_u16 v10, s8, s8
@@ -1622,10 +1622,10 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
; GFX11-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-NEXT: v_lshlrev_b16 v4, 8, v11
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_add_nc_u16 v7, s0, s0
+; GFX11-NEXT: v_add_nc_u16 v7, s4, s4
; GFX11-NEXT: v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT: v_add_nc_u16 v8, s5, s5
-; GFX11-NEXT: v_add_nc_u16 v11, s4, s4
+; GFX11-NEXT: v_add_nc_u16 v8, s1, s1
+; GFX11-NEXT: v_add_nc_u16 v11, s0, s0
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
@@ -1816,112 +1816,112 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
;
; VI-LABEL: amd_kernel_v32i8:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v10, 0
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s8, s3, 24
-; VI-NEXT: s_lshr_b32 s9, s3, 16
-; VI-NEXT: s_add_i32 s9, s9, s9
-; VI-NEXT: s_add_i32 s8, s8, s8
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_add_i32 s3, s3, s3
-; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s8
-; VI-NEXT: v_mov_b32_e32 v9, s9
-; VI-NEXT: s_lshr_b32 s10, s2, 24
-; VI-NEXT: s_lshr_b32 s11, s2, 16
+; VI-NEXT: s_lshr_b32 s0, s7, 24
+; VI-NEXT: s_lshr_b32 s1, s7, 16
+; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: s_add_i32 s7, s7, s7
+; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s0
+; VI-NEXT: v_mov_b32_e32 v9, s1
+; VI-NEXT: s_lshr_b32 s2, s6, 24
+; VI-NEXT: s_lshr_b32 s3, s6, 16
; VI-NEXT: v_add_u32_sdwa v3, vcc, v3, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s3
-; VI-NEXT: s_add_i32 s11, s11, s11
-; VI-NEXT: s_add_i32 s10, s10, s10
-; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v9, s7
+; VI-NEXT: s_add_i32 s3, s3, s3
; VI-NEXT: s_add_i32 s2, s2, s2
+; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: s_add_i32 s6, s6, s6
; VI-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s10
-; VI-NEXT: v_mov_b32_e32 v9, s11
-; VI-NEXT: s_lshr_b32 s12, s1, 24
-; VI-NEXT: s_lshr_b32 s13, s1, 16
+; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s2
+; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: s_lshr_b32 s12, s5, 24
+; VI-NEXT: s_lshr_b32 s13, s5, 16
; VI-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s2
-; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v9, s6
+; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_add_i32 s13, s13, s13
; VI-NEXT: s_add_i32 s12, s12, s12
; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_sdwa v4, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s9
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_add_i32 s5, s5, s5
; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s12
; VI-NEXT: v_mov_b32_e32 v9, s13
-; VI-NEXT: s_lshr_b32 s14, s0, 24
-; VI-NEXT: s_lshr_b32 s15, s0, 16
+; VI-NEXT: s_lshr_b32 s14, s4, 24
+; VI-NEXT: s_lshr_b32 s15, s4, 16
; VI-NEXT: v_add_u32_sdwa v5, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s1
+; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_add_u32_sdwa v6, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: s_add_i32 s15, s15, s15
; VI-NEXT: s_add_i32 s14, s14, s14
; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_sdwa v7, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_add_i32 s4, s4, s4
; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s14
; VI-NEXT: v_mov_b32_e32 v9, s15
-; VI-NEXT: s_lshr_b32 s16, s7, 24
-; VI-NEXT: s_lshr_b32 s17, s7, 16
+; VI-NEXT: s_lshr_b32 s16, s11, 24
+; VI-NEXT: s_lshr_b32 s17, s11, 16
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s0
+; VI-NEXT: v_mov_b32_e32 v9, s4
; VI-NEXT: s_add_i32 s17, s17, s17
; VI-NEXT: s_add_i32 s16, s16, s16
; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_add_i32 s7, s7, s7
+; VI-NEXT: s_add_i32 s11, s11, s11
; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s16
; VI-NEXT: v_mov_b32_e32 v9, s17
-; VI-NEXT: s_lshr_b32 s18, s6, 24
-; VI-NEXT: s_lshr_b32 s19, s6, 16
+; VI-NEXT: s_lshr_b32 s18, s10, 24
+; VI-NEXT: s_lshr_b32 s19, s10, 16
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s7
+; VI-NEXT: v_mov_b32_e32 v9, s11
; VI-NEXT: s_add_i32 s19, s19, s19
; VI-NEXT: s_add_i32 s18, s18, s18
; VI-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_add_i32 s6, s6, s6
+; VI-NEXT: s_add_i32 s10, s10, s10
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s18
; VI-NEXT: v_mov_b32_e32 v9, s19
-; VI-NEXT: s_lshr_b32 s20, s5, 24
-; VI-NEXT: s_lshr_b32 s21, s5, 16
+; VI-NEXT: s_lshr_b32 s20, s9, 24
+; VI-NEXT: s_lshr_b32 s21, s9, 16
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s6
+; VI-NEXT: v_mov_b32_e32 v9, s10
; VI-NEXT: s_add_i32 s21, s21, s21
; VI-NEXT: s_add_i32 s20, s20, s20
; VI-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_add_i32 s5, s5, s5
+; VI-NEXT: s_add_i32 s9, s9, s9
; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s20
; VI-NEXT: v_mov_b32_e32 v9, s21
-; VI-NEXT: s_lshr_b32 s22, s4, 24
-; VI-NEXT: s_lshr_b32 s23, s4, 16
+; VI-NEXT: s_lshr_b32 s22, s8, 24
+; VI-NEXT: s_lshr_b32 s23, s8, 16
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s5
+; VI-NEXT: v_mov_b32_e32 v9, s9
; VI-NEXT: s_add_i32 s23, s23, s23
; VI-NEXT: s_add_i32 s22, s22, s22
; VI-NEXT: v_or_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_add_i32 s4, s4, s4
+; VI-NEXT: s_add_i32 s8, s8, s8
; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s22
; VI-NEXT: v_mov_b32_e32 v9, s23
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s4
+; VI-NEXT: v_mov_b32_e32 v9, s8
; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v8, 16
@@ -1932,39 +1932,39 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
;
; GFX11-LABEL: amd_kernel_v32i8:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b16 v3, 8, s2
-; GFX11-NEXT: v_lshrrev_b16 v7, 8, s3
-; GFX11-NEXT: s_lshr_b32 s21, s3, 16
-; GFX11-NEXT: s_lshr_b32 s22, s3, 24
-; GFX11-NEXT: v_add_nc_u16 v8, s3, s3
-; GFX11-NEXT: v_add_nc_u16 v9, s2, s2
+; GFX11-NEXT: v_lshrrev_b16 v3, 8, s6
+; GFX11-NEXT: v_lshrrev_b16 v7, 8, s7
+; GFX11-NEXT: s_lshr_b32 s21, s7, 16
+; GFX11-NEXT: s_lshr_b32 s22, s7, 24
+; GFX11-NEXT: v_add_nc_u16 v8, s7, s7
+; GFX11-NEXT: v_add_nc_u16 v9, s6, s6
; GFX11-NEXT: v_add_nc_u16 v7, v7, v7
; GFX11-NEXT: v_add_nc_u16 v10, s22, s22
; GFX11-NEXT: v_add_nc_u16 v11, s21, s21
; GFX11-NEXT: v_add_nc_u16 v3, v3, v3
-; GFX11-NEXT: v_lshrrev_b16 v2, 8, s1
+; GFX11-NEXT: v_lshrrev_b16 v2, 8, s5
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v10
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT: s_lshr_b32 s18, s1, 16
-; GFX11-NEXT: s_lshr_b32 s19, s1, 24
-; GFX11-NEXT: s_lshr_b32 s20, s2, 24
-; GFX11-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-NEXT: s_lshr_b32 s18, s5, 16
+; GFX11-NEXT: s_lshr_b32 s19, s5, 24
+; GFX11-NEXT: s_lshr_b32 s20, s6, 24
+; GFX11-NEXT: s_lshr_b32 s6, s6, 16
; GFX11-NEXT: v_or_b32_e32 v7, v8, v7
; GFX11-NEXT: v_add_nc_u16 v8, s20, s20
; GFX11-NEXT: v_or_b32_e32 v10, v11, v10
; GFX11-NEXT: v_or_b32_e32 v3, v9, v3
-; GFX11-NEXT: v_add_nc_u16 v9, s2, s2
-; GFX11-NEXT: v_add_nc_u16 v11, s1, s1
+; GFX11-NEXT: v_add_nc_u16 v9, s6, s6
+; GFX11-NEXT: v_add_nc_u16 v11, s5, s5
; GFX11-NEXT: v_add_nc_u16 v2, v2, v2
; GFX11-NEXT: v_add_nc_u16 v12, s19, s19
; GFX11-NEXT: v_add_nc_u16 v13, s18, s18
-; GFX11-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX11-NEXT: v_lshrrev_b16 v1, 8, s4
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v8
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
@@ -1974,10 +1974,10 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v3
; GFX11-NEXT: v_or_b32_e32 v3, v9, v8
; GFX11-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-NEXT: v_add_nc_u16 v9, s0, s0
+; GFX11-NEXT: v_add_nc_u16 v9, s4, s4
; GFX11-NEXT: v_or_b32_e32 v8, v13, v12
; GFX11-NEXT: v_add_nc_u16 v1, v1, v1
-; GFX11-NEXT: v_lshrrev_b16 v6, 8, s7
+; GFX11-NEXT: v_lshrrev_b16 v6, 8, s11
; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v3
@@ -1985,14 +1985,14 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-NEXT: v_lshlrev_b16 v13, 8, v1
-; GFX11-NEXT: v_lshrrev_b16 v5, 8, s6
-; GFX11-NEXT: s_lshr_b32 s14, s7, 16
-; GFX11-NEXT: s_lshr_b32 s15, s7, 24
-; GFX11-NEXT: s_lshr_b32 s16, s0, 16
-; GFX11-NEXT: s_lshr_b32 s17, s0, 24
+; GFX11-NEXT: v_lshrrev_b16 v5, 8, s10
+; GFX11-NEXT: s_lshr_b32 s14, s11, 16
+; GFX11-NEXT: s_lshr_b32 s15, s11, 24
+; GFX11-NEXT: s_lshr_b32 s16, s4, 16
+; GFX11-NEXT: s_lshr_b32 s17, s4, 24
; GFX11-NEXT: v_or_b32_e32 v3, v7, v10
; GFX11-NEXT: v_or_b32_e32 v2, v14, v11
-; GFX11-NEXT: v_add_nc_u16 v7, s7, s7
+; GFX11-NEXT: v_add_nc_u16 v7, s11, s11
; GFX11-NEXT: v_or_b32_e32 v1, v12, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v13
; GFX11-NEXT: v_add_nc_u16 v9, s17, s17
@@ -2000,7 +2000,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_add_nc_u16 v6, v6, v6
; GFX11-NEXT: v_add_nc_u16 v11, s15, s15
; GFX11-NEXT: v_add_nc_u16 v12, s14, s14
-; GFX11-NEXT: v_add_nc_u16 v13, s6, s6
+; GFX11-NEXT: v_add_nc_u16 v13, s10, s10
; GFX11-NEXT: v_add_nc_u16 v5, v5, v5
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX11-NEXT: v_lshlrev_b16 v6, 8, v6
@@ -2008,16 +2008,16 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v9
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT: v_lshrrev_b16 v0, 8, s4
-; GFX11-NEXT: v_lshrrev_b16 v4, 8, s5
+; GFX11-NEXT: v_lshrrev_b16 v0, 8, s8
+; GFX11-NEXT: v_lshrrev_b16 v4, 8, s9
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT: s_lshr_b32 s12, s6, 16
-; GFX11-NEXT: s_lshr_b32 s13, s6, 24
-; GFX11-NEXT: s_lshr_b32 s8, s4, 16
-; GFX11-NEXT: s_lshr_b32 s9, s4, 24
-; GFX11-NEXT: s_lshr_b32 s10, s5, 16
-; GFX11-NEXT: s_lshr_b32 s11, s5, 24
+; GFX11-NEXT: s_lshr_b32 s12, s10, 16
+; GFX11-NEXT: s_lshr_b32 s13, s10, 24
+; GFX11-NEXT: s_lshr_b32 s0, s8, 16
+; GFX11-NEXT: s_lshr_b32 s1, s8, 24
+; GFX11-NEXT: s_lshr_b32 s2, s9, 16
+; GFX11-NEXT: s_lshr_b32 s3, s9, 24
; GFX11-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-NEXT: v_or_b32_e32 v7, v12, v11
; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v8
@@ -2025,14 +2025,14 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_add_nc_u16 v9, s13, s13
; GFX11-NEXT: v_add_nc_u16 v10, s12, s12
; GFX11-NEXT: v_or_b32_e32 v5, v13, v5
-; GFX11-NEXT: v_add_nc_u16 v11, s5, s5
+; GFX11-NEXT: v_add_nc_u16 v11, s9, s9
; GFX11-NEXT: v_add_nc_u16 v4, v4, v4
-; GFX11-NEXT: v_add_nc_u16 v13, s11, s11
-; GFX11-NEXT: v_add_nc_u16 v14, s10, s10
-; GFX11-NEXT: v_add_nc_u16 v15, s4, s4
+; GFX11-NEXT: v_add_nc_u16 v13, s3, s3
+; GFX11-NEXT: v_add_nc_u16 v14, s2, s2
+; GFX11-NEXT: v_add_nc_u16 v15, s8, s8
; GFX11-NEXT: v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT: v_add_nc_u16 v16, s9, s9
-; GFX11-NEXT: v_add_nc_u16 v17, s8, s8
+; GFX11-NEXT: v_add_nc_u16 v16, s1, s1
+; GFX11-NEXT: v_add_nc_u16 v17, s0, s0
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v9
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index a0499ef..8ad4535 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -35,11 +35,11 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; VI-LABEL: sadd64rr:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_add_u32 s0, s6, s0
-; VI-NEXT: s_addc_u32 s1, s7, s1
+; VI-NEXT: s_add_u32 s0, s6, s2
+; VI-NEXT: s_addc_u32 s1, s7, s3
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -77,11 +77,11 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1030W32: ; %bb.0: ; %entry
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_u32 s0, s6, s0
-; GFX1030W32-NEXT: s_addc_u32 s1, s7, s1
+; GFX1030W32-NEXT: s_add_u32 s0, s6, s2
+; GFX1030W32-NEXT: s_addc_u32 s1, s7, s3
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -91,11 +91,11 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1030W64: ; %bb.0: ; %entry
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_u32 s0, s6, s0
-; GFX1030W64-NEXT: s_addc_u32 s1, s7, s1
+; GFX1030W64-NEXT: s_add_u32 s0, s6, s2
+; GFX1030W64-NEXT: s_addc_u32 s1, s7, s3
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -105,10 +105,10 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s6, s0
-; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -144,74 +144,74 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: sadd64ri:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s2, 0x56789876
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s3, 0x1234
+; VI-NEXT: s_add_u32 s0, s6, 0x56789876
+; VI-NEXT: s_addc_u32 s1, s7, 0x1234
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: sadd64ri:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX9-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX9-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: sadd64ri:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX1010-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX1010-NEXT: v_mov_b32_e32 v0, s2
-; GFX1010-NEXT: v_mov_b32_e32 v1, s3
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX1010-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: sadd64ri:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX1030W32-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX1030W32-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: sadd64ri:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX1030W64-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX1030W64-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: sadd64ri:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX11-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -243,66 +243,66 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: vadd64rr:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vadd64rr:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vadd64rr:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: v_add_co_u32 v0, s2, s2, v0
-; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: v_add_co_u32 v0, s0, s6, v0
+; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, s7, 0, s0
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vadd64rr:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_add_co_u32 v0, s2, s2, v0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: v_add_co_u32 v0, s0, s6, v0
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s0
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vadd64rr:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s2, v0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], s6, v0
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vadd64rr:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -334,65 +334,66 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
;
; VI-LABEL: vadd64ri:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x56789876, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x1234
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vadd64ri:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x56789876, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vadd64ri:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
+; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1010-NEXT: s_mov_b32 null, 0
+; GFX1010-NEXT: v_add_co_u32 v0, s0, 0x56789876, v0
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
-; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2
+; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, 0x1234, s0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vadd64ri:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030W32-NEXT: v_add_co_u32 v0, s0, 0x56789876, v0
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vadd64ri:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], 0x56789876, v0
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[2:3]
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[0:1]
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vadd64ri:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_add_co_u32 v0, s0, 0x56789876, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -424,12 +425,12 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-LABEL: suaddo32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_add_i32 s0, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -460,36 +461,36 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_i32 s2, s2, s3
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2
-; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1030W32-NEXT: s_add_i32 s0, s2, s3
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030W32-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: suaddo32:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_i32 s2, s2, s3
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2
-; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1030W64-NEXT: s_add_i32 s0, s2, s3
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030W64-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: suaddo32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s2, s3
+; GFX11-NEXT: s_add_i32 s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -534,12 +535,12 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: uaddo32_vcc_user:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -576,42 +577,42 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX1030W32-LABEL: uaddo32_vcc_user:
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
-; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_add_co_u32 v1, s4, s4, s5
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1030W32-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX1030W32-NEXT: v_add_co_u32 v1, s0, s2, s3
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1030W32-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX1030W32-NEXT: global_store_byte v0, v2, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: uaddo32_vcc_user:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
-; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_add_co_u32 v1, s[4:5], s4, s5
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1030W64-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX1030W64-NEXT: v_add_co_u32 v1, s[0:1], s2, s3
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX1030W64-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX1030W64-NEXT: global_store_byte v0, v2, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: uaddo32_vcc_user:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v1, s4, s4, s5
+; GFX11-NEXT: v_add_co_u32 v1, s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT: global_store_b8 v0, v2, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -655,19 +656,19 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; VI-LABEL: suaddo64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_addc_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -675,80 +676,80 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; GFX9-LABEL: suaddo64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s6, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_addc_u32 s7, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_add_u32 s0, s8, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s9, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: suaddo64:
; GFX1010: ; %bb.0:
-; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: s_add_u32 s6, s4, s6
-; GFX1010-NEXT: s_addc_u32 s7, s5, s7
-; GFX1010-NEXT: v_mov_b32_e32 v0, s6
-; GFX1010-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1010-NEXT: v_mov_b32_e32 v1, s7
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1010-NEXT: s_add_u32 s0, s8, s10
+; GFX1010-NEXT: s_addc_u32 s1, s9, s11
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1010-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: suaddo64:
; GFX1030W32: ; %bb.0:
-; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_u32 s6, s4, s6
-; GFX1030W32-NEXT: s_addc_u32 s7, s5, s7
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W32-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: s_add_u32 s0, s8, s10
+; GFX1030W32-NEXT: s_addc_u32 s1, s9, s11
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: suaddo64:
; GFX1030W64: ; %bb.0:
-; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_u32 s6, s4, s6
-; GFX1030W64-NEXT: s_addc_u32 s7, s5, s7
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W64-NEXT: v_cmp_lt_u64_e64 s[4:5], s[6:7], s[4:5]
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: s_add_u32 s0, s8, s10
+; GFX1030W64-NEXT: s_addc_u32 s1, s9, s11
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], s[8:9]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: suaddo64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s6, s4, s6
-; GFX11-NEXT: s_addc_u32 s7, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_add_u32 s0, s8, s10
+; GFX11-NEXT: s_addc_u32 s1, s9, s11
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -792,13 +793,13 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-LABEL: vuaddo64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v6, s1
-; VI-NEXT: v_add_u32_e32 v5, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_add_u32_e32 v5, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[5:6]
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[5:6]
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v3, s6
; VI-NEXT: v_mov_b32_e32 v4, s7
@@ -840,48 +841,48 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-LABEL: vuaddo64:
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
-; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_add_co_u32 v0, s6, s4, v0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6
-; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX1030W32-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vuaddo64:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
-; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_add_co_u32 v0, s[6:7], s4, v0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s[6:7]
-; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
+; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vuaddo64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, s6, s4, v0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6
-; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -920,11 +921,11 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; VI-LABEL: ssub64rr:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_sub_u32 s0, s6, s0
-; VI-NEXT: s_subb_u32 s1, s7, s1
+; VI-NEXT: s_sub_u32 s0, s6, s2
+; VI-NEXT: s_subb_u32 s1, s7, s3
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -962,11 +963,11 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1030W32: ; %bb.0: ; %entry
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_u32 s0, s6, s0
-; GFX1030W32-NEXT: s_subb_u32 s1, s7, s1
+; GFX1030W32-NEXT: s_sub_u32 s0, s6, s2
+; GFX1030W32-NEXT: s_subb_u32 s1, s7, s3
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -976,11 +977,11 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1030W64: ; %bb.0: ; %entry
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_u32 s0, s6, s0
-; GFX1030W64-NEXT: s_subb_u32 s1, s7, s1
+; GFX1030W64-NEXT: s_sub_u32 s0, s6, s2
+; GFX1030W64-NEXT: s_subb_u32 s1, s7, s3
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -990,10 +991,10 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s0, s6, s0
-; GFX11-NEXT: s_subb_u32 s1, s7, s1
+; GFX11-NEXT: s_sub_u32 s0, s6, s2
+; GFX11-NEXT: s_subb_u32 s1, s7, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -1029,74 +1030,74 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: ssub64ri:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, 0x56789876, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, 0x1234, s3
+; VI-NEXT: s_sub_u32 s0, 0x56789876, s6
+; VI-NEXT: s_subb_u32 s1, 0x1234, s7
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: ssub64ri:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX9-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX9-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: ssub64ri:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX1010-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX1010-NEXT: v_mov_b32_e32 v0, s2
-; GFX1010-NEXT: v_mov_b32_e32 v1, s3
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX1010-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: ssub64ri:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX1030W32-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX1030W32-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: ssub64ri:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX1030W64-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX1030W64-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: ssub64ri:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX11-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1128,66 +1129,66 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: vsub64rr:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_sub_u32_e32 v3, vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_sub_u32_e32 v3, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vsub64rr:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vsub64rr:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: v_sub_co_u32 v0, s2, s2, v0
-; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s3, 0, s2
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: v_sub_co_u32 v0, s0, s6, v0
+; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, s7, 0, s0
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vsub64rr:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, s2, v0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: v_sub_co_u32 v0, s0, s6, v0
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s0
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vsub64rr:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s2, v0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s6, v0
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vsub64rr:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: v_sub_co_u32 v0, s0, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1219,65 +1220,66 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
;
; VI-LABEL: vsub64ri:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x56789876, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x1234
; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vsub64ri:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0x56789876, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vsub64ri:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1010-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
+; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1010-NEXT: s_mov_b32 null, 0
+; GFX1010-NEXT: v_sub_co_u32 v0, s0, 0x56789876, v0
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
-; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, 0x1234, 0, s2
+; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, 0x1234, 0, s0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vsub64ri:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030W32-NEXT: v_sub_co_u32 v0, s0, 0x56789876, v0
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vsub64ri:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1030W64-NEXT: v_sub_co_u32 v0, s[2:3], 0x56789876, v0
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], 0x56789876, v0
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3]
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[0:1]
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vsub64ri:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_sub_co_u32 v0, s0, 0x56789876, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1310,12 +1312,12 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-LABEL: susubo32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sub_i32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sub_i32 s0, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1346,36 +1348,36 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_i32 s2, s2, s3
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2
-; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1030W32-NEXT: s_sub_i32 s0, s2, s3
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030W32-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: susubo32:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_i32 s2, s2, s3
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2
-; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1030W64-NEXT: s_sub_i32 s0, s2, s3
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030W64-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: susubo32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s2, s2, s3
+; GFX11-NEXT: s_sub_i32 s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1420,12 +1422,12 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: usubo32_vcc_user:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_sub_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_sub_u32_e32 v4, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -1462,42 +1464,42 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX1030W32-LABEL: usubo32_vcc_user:
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
-; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_sub_co_u32 v1, s4, s4, s5
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1030W32-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX1030W32-NEXT: v_sub_co_u32 v1, s0, s2, s3
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1030W32-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX1030W32-NEXT: global_store_byte v0, v2, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: usubo32_vcc_user:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
-; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], s4, s5
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1030W64-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX1030W64-NEXT: v_sub_co_u32 v1, s[0:1], s2, s3
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX1030W64-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX1030W64-NEXT: global_store_byte v0, v2, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: usubo32_vcc_user:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_co_u32 v1, s4, s4, s5
+; GFX11-NEXT: v_sub_co_u32 v1, s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT: global_store_b8 v0, v2, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1541,19 +1543,19 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; VI-LABEL: susubo64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_sub_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_subb_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -1561,80 +1563,80 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; GFX9-LABEL: susubo64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s6, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_subb_u32 s7, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_sub_u32 s0, s8, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_subb_u32 s1, s9, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: susubo64:
; GFX1010: ; %bb.0:
-; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: s_sub_u32 s6, s4, s6
-; GFX1010-NEXT: s_subb_u32 s7, s5, s7
-; GFX1010-NEXT: v_mov_b32_e32 v0, s6
-; GFX1010-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1010-NEXT: v_mov_b32_e32 v1, s7
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1010-NEXT: s_sub_u32 s0, s8, s10
+; GFX1010-NEXT: s_subb_u32 s1, s9, s11
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1010-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: susubo64:
; GFX1030W32: ; %bb.0:
-; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_u32 s6, s4, s6
-; GFX1030W32-NEXT: s_subb_u32 s7, s5, s7
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W32-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: s_sub_u32 s0, s8, s10
+; GFX1030W32-NEXT: s_subb_u32 s1, s9, s11
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: susubo64:
; GFX1030W64: ; %bb.0:
-; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_u32 s6, s4, s6
-; GFX1030W64-NEXT: s_subb_u32 s7, s5, s7
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W64-NEXT: v_cmp_gt_u64_e64 s[4:5], s[6:7], s[4:5]
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: s_sub_u32 s0, s8, s10
+; GFX1030W64-NEXT: s_subb_u32 s1, s9, s11
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], s[8:9]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: susubo64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s6, s4, s6
-; GFX11-NEXT: s_subb_u32 s7, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_sub_u32 s0, s8, s10
+; GFX11-NEXT: s_subb_u32 s1, s9, s11
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1678,13 +1680,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-LABEL: vusubo64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v6, s1
-; VI-NEXT: v_sub_u32_e32 v5, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_sub_u32_e32 v5, vcc, s2, v0
; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[5:6]
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[5:6]
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v3, s6
; VI-NEXT: v_mov_b32_e32 v4, s7
@@ -1726,48 +1728,48 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-LABEL: vusubo64:
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
-; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_sub_co_u32 v0, s6, s4, v0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6
-; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX1030W32-NEXT: v_sub_co_u32 v0, s0, s2, v0
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vusubo64:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
-; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_sub_co_u32 v0, s[6:7], s4, v0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s[6:7]
-; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s2, v0
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
+; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vusubo64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_co_u32 v0, s6, s4, v0
+; GFX11-NEXT: v_sub_co_u32 v0, s0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index 84bd9b6..5c9762b 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -23,15 +23,15 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_add_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v2, v3, 1.0 clamp
@@ -40,24 +40,24 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_add_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -95,15 +95,15 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_multi_use_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -115,29 +115,29 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_multi_use_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_max_f32_e64 v2, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_multi_use_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v2, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
@@ -174,15 +174,15 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_clamp_dbg_use_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v2, v3, 1.0 clamp
@@ -191,24 +191,24 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_clamp_dbg_use_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_dbg_use_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -244,15 +244,15 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_clamp_add_neg_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_floor_f32_e32 v2, v3
@@ -262,27 +262,27 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_clamp_add_neg_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_floor_f32_e32 v1, v1
; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_neg_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_floor_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -318,15 +318,15 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_non_clamp_max_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -336,27 +336,27 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_non_clamp_max_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_non_clamp_max_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, 0, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -389,15 +389,15 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out,
;
; GFX8-LABEL: v_clamp_add_src_f32_denormals:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v2, v3, 1.0 clamp
@@ -406,24 +406,24 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_clamp_add_src_f32_denormals:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f32_denormals:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -459,15 +459,15 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_add_src_f16_denorm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -476,24 +476,24 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_add_src_f16_denorm:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f16_denorm:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -529,15 +529,15 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
;
; GFX8-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -546,24 +546,24 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -598,15 +598,15 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: v_clamp_add_src_v2f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
@@ -616,26 +616,26 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr
;
; GFX9-LABEL: v_clamp_add_src_v2f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -669,15 +669,15 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_add_src_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp
@@ -686,24 +686,24 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_add_src_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -826,16 +826,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -846,24 +846,24 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -905,16 +905,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -925,24 +925,24 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -992,16 +992,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e32 v2, 1.0, v3
@@ -1014,27 +1014,27 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1078,16 +1078,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1)
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e32 v2, 1.0, v3
@@ -1099,27 +1099,27 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1)
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1165,16 +1165,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1)
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -1186,27 +1186,27 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1)
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1251,16 +1251,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v2, v3, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1271,27 +1271,27 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1334,15 +1334,15 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou
;
; GFX8-LABEL: v_no_clamp_add_src_v2f16_f32_src:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -1354,27 +1354,27 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_no_clamp_add_src_v2f16_f32_src:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_no_clamp_add_src_v2f16_f32_src:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1419,16 +1419,16 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out,
;
; GFX8-LABEL: v_no_clamp_add_packed_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1440,27 +1440,27 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_no_clamp_add_packed_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_no_clamp_add_packed_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1505,16 +1505,16 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
;
; GFX8-LABEL: v_no_clamp_add_src_v2f16_f16_src:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v2, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v2, 1.0 clamp
@@ -1523,30 +1523,30 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v1, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 9472845..57e855f 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -24,15 +24,15 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -41,37 +41,37 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -105,15 +105,15 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
;
; GFX8-LABEL: v_clamp_neg_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, -v3, -v3 clamp
@@ -122,37 +122,37 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: v_clamp_neg_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neg_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -187,15 +187,15 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: v_clamp_negabs_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, -|v3|, -|v3| clamp
@@ -204,37 +204,37 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
;
; GFX9-LABEL: v_clamp_negabs_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negabs_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -273,15 +273,15 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_negzero_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -292,43 +292,43 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_negzero_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negzero_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negzero_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -367,15 +367,15 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
;
; GFX8-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
@@ -386,43 +386,43 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -461,15 +461,15 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_multi_use_max_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
@@ -482,31 +482,31 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_multi_use_max_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0, v1
; GFX9-NEXT: v_min_f32_e32 v2, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_multi_use_max_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, 0, v1
; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
@@ -515,16 +515,16 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
;
; GFX12-LABEL: v_clamp_multi_use_max_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1
; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1
-; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v1, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -563,15 +563,15 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e64 v2, v3, v3 clamp
@@ -580,37 +580,37 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f16_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -645,15 +645,15 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX8-LABEL: v_clamp_neg_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e64 v2, -v3, -v3 clamp
@@ -662,37 +662,37 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: v_clamp_neg_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neg_f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f16_e64 v1, -v1, -v1 clamp
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -728,15 +728,15 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: v_clamp_negabs_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e64 v2, -|v3|, -|v3| clamp
@@ -745,37 +745,37 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
;
; GFX9-LABEL: v_clamp_negabs_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negabs_f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -812,15 +812,15 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
@@ -829,37 +829,37 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e64 v[0:1], v[0:1], v[0:1] clamp
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -893,15 +893,15 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
;
; GFX8-LABEL: v_clamp_neg_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
@@ -910,37 +910,37 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: v_clamp_neg_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neg_f64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] clamp
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -975,15 +975,15 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: v_clamp_negabs_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
@@ -992,37 +992,37 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
;
; GFX9-LABEL: v_clamp_negabs_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negabs_f64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1060,16 +1060,16 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
;
; GFX8-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_brev_b32 s0, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: s_brev_b32 s0, 1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, s0, 1.0, v3
@@ -1078,38 +1078,38 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_brev_b32 s0, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT: s_brev_b32 s2, 1
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, s2, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: v_med3_f32 v1, s0, 1.0, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, 0x80000000, 1.0, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v1, 0x80000000, 1.0, v1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1141,15 +1141,15 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_aby_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1158,37 +1158,37 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_aby_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_aby_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_aby_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1220,15 +1220,15 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_bay_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1237,37 +1237,37 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_bay_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_bay_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_bay_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1299,15 +1299,15 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_yab_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1316,37 +1316,37 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_yab_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_yab_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_yab_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1378,15 +1378,15 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_yba_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1395,37 +1395,37 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_yba_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_yba_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_yba_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1457,15 +1457,15 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_ayb_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1474,37 +1474,37 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_ayb_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_ayb_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_ayb_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1536,15 +1536,15 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_bya_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1553,37 +1553,37 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_bya_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_bya_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_bya_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1611,41 +1611,41 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #
;
; GFX8-LABEL: v_clamp_constants_to_one_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 1.0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constants_to_one_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 1.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constants_to_one_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constants_to_one_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1670,41 +1670,41 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out)
;
; GFX8-LABEL: v_clamp_constants_to_zero_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constants_to_zero_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constants_to_zero_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constants_to_zero_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1730,41 +1730,41 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out)
;
; GFX8-LABEL: v_clamp_constant_preserve_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0.5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_preserve_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0.5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_preserve_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_preserve_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1790,41 +1790,41 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1)
;
; GFX8-LABEL: v_clamp_constant_preserve_denorm_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_preserve_denorm_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_preserve_denorm_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1849,41 +1849,41 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 {
;
; GFX8-LABEL: v_clamp_constant_qnan_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_qnan_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_qnan_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_qnan_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1908,41 +1908,41 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 {
;
; GFX8-LABEL: v_clamp_constant_snan_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_snan_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_snan_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_snan_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1977,15 +1977,15 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -1995,40 +1995,40 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2063,15 +2063,15 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_f32_snan_dx10clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v2, v3, 0.5 clamp
@@ -2080,37 +2080,37 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_f32_snan_dx10clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32_snan_dx10clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32_snan_dx10clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2146,15 +2146,15 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
;
; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
@@ -2164,40 +2164,40 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2232,15 +2232,15 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
;
; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -2250,40 +2250,40 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
;
; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2318,15 +2318,15 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -2335,37 +2335,37 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2397,15 +2397,15 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -2414,37 +2414,37 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2476,15 +2476,15 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, v3, 0, 1.0
@@ -2493,37 +2493,37 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2555,15 +2555,15 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, v3, 1.0, 0
@@ -2572,37 +2572,37 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, 1.0, 0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, 1.0, 0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2634,15 +2634,15 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, 0, v3, 1.0
@@ -2651,37 +2651,37 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, 0, v1, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, 0, v1, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2713,15 +2713,15 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, 1.0, v3, 0
@@ -2730,37 +2730,37 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, 1.0, v1, 0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, 1.0, v1, 0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2788,41 +2788,41 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace
;
; GFX8-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2848,41 +2848,41 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace
;
; GFX8-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800001
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800001
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2918,15 +2918,15 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX8-LABEL: v_clamp_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -2937,37 +2937,37 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_clamp_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3011,16 +3011,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_clamp_v2f16_undef_elt:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3035,37 +3035,37 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_clamp_v2f16_undef_elt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_undef_elt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_undef_elt:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3107,15 +3107,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
;
; GFX8-LABEL: v_clamp_v2f16_not_zero:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
@@ -3128,45 +3128,45 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_clamp_v2f16_not_zero:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, 2.0
; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_not_zero:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_not_zero:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 2.0
; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0]
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3207,15 +3207,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: v_clamp_v2f16_not_one:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
@@ -3228,45 +3228,45 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
;
; GFX9-LABEL: v_clamp_v2f16_not_one:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, 0
; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_not_one:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, 0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_not_one:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 0
; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3307,15 +3307,15 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-LABEL: v_clamp_neg_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3326,37 +3326,37 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac
;
; GFX9-LABEL: v_clamp_neg_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neg_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3398,15 +3398,15 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_negabs_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, -|v3|, -|v3| clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3417,42 +3417,42 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_negabs_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negabs_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3496,15 +3496,15 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_neglo_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3515,37 +3515,37 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_neglo_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neglo_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neglo_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3588,15 +3588,15 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_neghi_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3607,37 +3607,37 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_neghi_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neghi_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neghi_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3680,15 +3680,15 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: v_clamp_v2f16_shuffle:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3699,37 +3699,37 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr
;
; GFX9-LABEL: v_clamp_v2f16_shuffle:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_shuffle:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_shuffle:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3774,16 +3774,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
;
; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3798,37 +3798,37 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
;
; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3872,16 +3872,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
;
; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3896,37 +3896,37 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
;
; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3961,70 +3961,70 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_clamp_diff_source_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x8
-; GFX8-NEXT: s_add_u32 s0, s0, 12
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s6, s[6:7], 0x8
+; GFX8-NEXT: s_add_u32 s2, s4, 12
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_add_f32_e32 v0, s4, v0
-; GFX8-NEXT: v_add_f32_e32 v1, s4, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s6
+; GFX8-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX8-NEXT: v_add_f32_e32 v1, s0, v1
; GFX8-NEXT: v_max_f32_e64 v2, v0, v1 clamp
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_diff_source_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX9-NEXT: s_load_dword s6, s[2:3], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v1
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_add_f32_e32 v1, s0, v1
+; GFX9-NEXT: v_add_f32_e32 v2, s0, v2
; GFX9-NEXT: v_max_f32_e64 v1, v1, v2 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:12
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5] offset:12
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_diff_source_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e64 v0, s4, s5
-; GFX11-NEXT: v_add_f32_e64 v1, s4, s2
+; GFX11-NEXT: v_add_f32_e64 v0, s0, s1
+; GFX11-NEXT: v_add_f32_e64 v1, s0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v0, v0, v1 clamp
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] offset:12
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] offset:12
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_diff_source_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0
+; GFX12-NEXT: s_load_b96 s[0:2], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_f32 s2, s4, s5
-; GFX12-NEXT: s_add_f32 s3, s4, s6
+; GFX12-NEXT: s_add_f32 s1, s0, s1
+; GFX12-NEXT: s_add_f32 s0, s0, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
-; GFX12-NEXT: s_max_num_f32 s2, s2, s3
-; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] offset:12
+; GFX12-NEXT: s_max_num_f32 s0, s1, s0
+; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 clamp
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] offset:12
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index 9c7fa15..b969573c8 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -20,14 +20,14 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: add1:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -103,14 +103,14 @@ define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: sub1:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -450,15 +450,15 @@ define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: add_and:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_max_u32_e32 v1, 1, v1
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -493,14 +493,14 @@ define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: cmp_sub_sext:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -533,14 +533,14 @@ define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: cmp_sub_zext:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
index c27e446..4b266d0 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
@@ -4,13 +4,13 @@
define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) {
; GCN-LABEL: vectorLoadCombine:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_load_dword v2, v[0:1]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -37,14 +37,14 @@ entry:
define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) {
; GCN-LABEL: vectorLoadShuffle:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s0, 0x7050604
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_load_dword v2, v[0:1]
-; GCN-NEXT: s_mov_b32 s0, 0x7050604
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_perm_b32 v2, v2, v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index e9dbce9..52b9603 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -26,17 +26,17 @@ define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_copy_v4i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -73,24 +73,23 @@ define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspa
; VI-LABEL: test_copy_v4i8_x2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, s7
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -129,27 +128,27 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa
;
; VI-LABEL: test_copy_v4i8_x3:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s11
+; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -199,31 +198,30 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s22, s10
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s23, s11
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s20, s6
-; VI-NEXT: s_mov_b32 s21, s7
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s22, s2
+; VI-NEXT: s_mov_b32 s23, s3
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s20, s10
+; VI-NEXT: s_mov_b32 s21, s11
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_store_dword v0, off, s[20:23], 0
@@ -280,22 +278,21 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr
; VI-LABEL: test_copy_v4i8_extra_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, s7
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1
@@ -310,7 +307,7 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr
; VI-NEXT: v_add_u16_e32 v2, 0x900, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -365,23 +362,23 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p
;
; VI-LABEL: test_copy_v4i8_x2_extra_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s11
+; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1
@@ -396,9 +393,9 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p
; VI-NEXT: v_add_u16_e32 v2, 0x900, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -433,19 +430,19 @@ define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addr
;
; VI-LABEL: test_copy_v3i8_align4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
-; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -477,22 +474,22 @@ define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addr
;
; VI-LABEL: test_copy_v3i8_align2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:2
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: buffer_store_short v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load <3 x i8>, ptr addrspace(1) %in, align 2
store <3 x i8> %val, ptr addrspace(1) %out, align 2
@@ -525,24 +522,24 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr
;
; VI-LABEL: test_copy_v3i8_align1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
+; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT: v_lshrrev_b16_e32 v0, 8, v0
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:1
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1
; VI-NEXT: s_endpgm
%val = load <3 x i8>, ptr addrspace(1) %in, align 1
store <3 x i8> %val, ptr addrspace(1) %out, align 1
@@ -569,19 +566,19 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, p
;
; VI-LABEL: test_copy_v4i8_volatile_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load volatile <4 x i8>, ptr addrspace(1) %in, align 4
store <4 x i8> %val, ptr addrspace(1) %out, align 4
@@ -618,28 +615,28 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out,
;
; VI-LABEL: test_copy_v4i8_volatile_store:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; VI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:3
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
+; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1
+; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v3, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v3, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%val = load <4 x i8>, ptr addrspace(1) %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
index 7dd95a0..f10fe68 100644
--- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
@@ -8,21 +8,21 @@ define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) %
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:252
-; GCN-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
-; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GCN-NEXT: s_cselect_b32 s2, 2, 3
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT: s_cselect_b32 s0, 2, 3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: global_store_dword v1, v0, s[4:5]
; GCN-NEXT: s_endpgm
entry: ; preds = %1009
%0 = load i32, ptr addrspace(1) %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 332b601..848ac3b 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -36,15 +36,15 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n
;
; VI-LABEL: s_ctlz_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s4, s4
-; VI-NEXT: s_min_u32 s4, s4, 32
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_flbit_i32_b32 s0, s2
+; VI-NEXT: s_min_u32 s0, s0, 32
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_i32:
@@ -88,14 +88,14 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n
; GFX11-LABEL: s_ctlz_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s2, s2
+; GFX11-NEXT: s_clz_i32_u32 s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 32
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_min_u32 s0, s0, 32
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -612,16 +612,16 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32],
;
; VI-LABEL: s_ctlz_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b64 s4, s[4:5]
-; VI-NEXT: s_min_u32 s4, s4, 64
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_flbit_i32_b64 s0, s[2:3]
+; VI-NEXT: s_min_u32 s0, s0, 64
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_i64:
@@ -674,13 +674,13 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32],
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3]
+; GFX11-NEXT: s_clz_i32_u64 s0, s[2:3]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 64
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX11-NEXT: s_min_u32 s0, s0, 64
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 97529b5..2dd3a7b 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -41,13 +41,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
;
; VI-LABEL: s_ctlz_zero_undef_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_flbit_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -331,14 +331,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
;
; VI-LABEL: s_ctlz_zero_undef_i8_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s2, s2, 24
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshl_b32 s0, s4, 24
+; VI-NEXT: s_flbit_i32_b32 s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -405,15 +405,15 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
;
; VI-LABEL: s_ctlz_zero_undef_i16_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: s_add_i32 s2, s2, -16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: s_flbit_i32_b32 s0, s0
+; VI-NEXT: s_add_i32 s0, s0, -16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -479,13 +479,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
;
; VI-LABEL: s_ctlz_zero_undef_i32_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_flbit_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1198,13 +1198,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
; VI-LABEL: s_ctlz_zero_undef_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_flbit_i32_b64 s0, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -2218,19 +2218,19 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out,
;
; VI-LABEL: s_ctlz_zero_undef_i18:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x3ffff
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s2, s2, -14
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_add_u32 s0, s0, 2
+; VI-NEXT: s_and_b32 s0, s4, 0x3ffff
+; VI-NEXT: s_flbit_i32_b32 s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_add_i32 s4, s0, -14
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_add_u32 s0, s2, 2
; VI-NEXT: flat_store_short v[0:1], v2
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_bfe_u32 s2, s2, 0x20010
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: s_bfe_u32 s2, s4, 0x20010
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 4f2bde8..6e39b83 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -27,15 +27,15 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val)
;
; VI-LABEL: s_ctpop_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s4, s4, 0xffff
-; VI-NEXT: s_bcnt1_i32_b32 s4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_and_b32 s0, s2, 0xffff
+; VI-NEXT: s_bcnt1_i32_b32 s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctpop_i16:
@@ -167,14 +167,14 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out,
; VI-LABEL: v_ctpop_add_chain_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1423,15 +1423,15 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou
; VI-LABEL: v_ctpop_i16_add_vvar_inv:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s7, 0xf000
@@ -1521,29 +1521,29 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: ctpop_i16_in_br:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: s_cmp_lg_u32 s5, 0
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc0 .LBB14_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
; VI-NEXT: s_cbranch_execnz .LBB14_3
; VI-NEXT: .LBB14_2: ; %if
-; VI-NEXT: s_and_b32 s2, s4, 0xffff
-; VI-NEXT: s_bcnt1_i32_b32 s2, s2
+; VI-NEXT: s_and_b32 s0, s2, 0xffff
+; VI-NEXT: s_bcnt1_i32_b32 s0, s0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: .LBB14_3: ; %endif
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB14_4:
; VI-NEXT: ; implicit-def: $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 633f120..bd451dc 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -28,14 +28,14 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32],
;
; VI-LABEL: s_ctpop_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%truncctpop = trunc i64 %ctpop to i32
@@ -116,7 +116,7 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a
; VI-LABEL: v_ctpop_i64_user:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
@@ -128,8 +128,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT: v_bcnt_u32_b32 v0, v1, v0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_or_b32_e32 v0, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_or_b32_e32 v0, s2, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -159,15 +159,15 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64
; VI-LABEL: s_ctpop_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT: s_endpgm
%ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
%truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
@@ -197,19 +197,19 @@ define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64
; VI-LABEL: s_ctpop_v4i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s15, 0xf000
+; VI-NEXT: s_mov_b32 s14, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
-; VI-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
-; VI-NEXT: s_bcnt1_i32_b64 s7, s[10:11]
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
+; VI-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
+; VI-NEXT: s_bcnt1_i32_b64 s3, s[10:11]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT: s_endpgm
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
%truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
@@ -424,15 +424,15 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val
; VI-LABEL: s_ctpop_i128:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_add_i32 s4, s4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; VI-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
+; VI-NEXT: s_add_i32 s0, s1, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
%truncctpop = trunc i128 %ctpop to i32
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 483402d..e1b01c0 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -35,15 +35,15 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n
;
; VI-LABEL: s_cttz_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s4, s4
-; VI-NEXT: s_min_u32 s4, s4, 32
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_ff1_i32_b32 s0, s2
+; VI-NEXT: s_min_u32 s0, s0, 32
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_cttz_i32:
@@ -519,16 +519,16 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32],
;
; VI-LABEL: s_cttz_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_min_u32 s4, s4, 64
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ff1_i32_b64 s0, s[2:3]
+; VI-NEXT: s_min_u32 s0, s0, 64
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_cttz_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index a6cbfa5..7eb2e52 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -28,13 +28,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
;
; VI-LABEL: s_cttz_zero_undef_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_ff1_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -317,13 +317,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
;
; VI-LABEL: s_cttz_zero_undef_i8_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_ff1_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -386,13 +386,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
;
; VI-LABEL: s_cttz_zero_undef_i16_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_ff1_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -455,13 +455,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
;
; VI-LABEL: s_cttz_zero_undef_i32_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_ff1_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index fd4e182..e6d68a1 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -2788,36 +2788,36 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI: ; %bb.0: ; %bb
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; VI-NEXT: v_add_f32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: cvt_ubyte0_or_multiuse:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX10-NEXT: global_load_dword v0, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: global_store_dword v2, v0, s[2:3]
+; GFX10-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX9-LABEL: cvt_ubyte0_or_multiuse:
@@ -2836,17 +2836,17 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
;
; GFX11-LABEL: cvt_ubyte0_or_multiuse:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
+; GFX11-NEXT: global_store_b32 v2, v0, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index fed4b98..37b4dfa 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -8,13 +8,13 @@
define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: add:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_add v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_add v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -30,13 +30,13 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: sub:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -52,13 +52,13 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_and v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_and v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -74,13 +74,13 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: or:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_or v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_or v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -96,13 +96,13 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q
define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: xor:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -118,28 +118,28 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: nand:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: .LBB5_1: ; %atomicrmw.start
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_not_b32_e32 v0, v3
; CHECK-NEXT: v_or_b32_e32 v2, -2, v0
-; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_cbranch_execnz .LBB5_1
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
@@ -154,13 +154,13 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: max_workgroup:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -176,13 +176,13 @@ define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addr
define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: max:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -198,13 +198,13 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: min_workgroup:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -220,13 +220,13 @@ define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addr
define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: min:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -242,13 +242,13 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: umax_workgroup:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -264,13 +264,13 @@ define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr add
define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: umax:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -286,13 +286,13 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: umin_workgroup:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -308,13 +308,13 @@ define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr add
define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: umin:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -330,14 +330,14 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: cmpxchg:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v0, 2
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -354,13 +354,13 @@ define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(
define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: xchg:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -376,13 +376,13 @@ define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: inc:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_inc v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_inc v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -398,13 +398,13 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: dec:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_dec v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_dec v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -420,28 +420,28 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: fadd:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3
-; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_cbranch_execnz .LBB18_1
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
@@ -457,28 +457,28 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: fsub:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3
-; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_cbranch_execnz .LBB19_1
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
@@ -494,14 +494,14 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: fmin:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
@@ -519,14 +519,14 @@ define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: fmax:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
@@ -547,13 +547,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_swap v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -569,13 +569,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -591,13 +591,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -613,13 +613,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_smin v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smin.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -635,13 +635,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_smax v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smax.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -657,13 +657,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_umin v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.umin.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -679,13 +679,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_umax v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.umax.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -701,13 +701,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_and v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.and.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -723,13 +723,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_or v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.or.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -745,13 +745,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_xor v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.xor.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -767,13 +767,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_inc v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.inc.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -789,13 +789,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_dec v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.dec.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -812,13 +812,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8)
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v1, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32 1, i32 2, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -834,14 +834,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v1, 1.0
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: buffer_atomic_add_f32 v1, v0, s[4:7], 0 offen glc
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%f32 = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float 1.0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -859,14 +860,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%f64 = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double 1.0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -884,14 +886,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%f64 = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double 1.0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 67b0cef..cff77bf 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -42,13 +42,13 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
; GFX11-LABEL: uniform_vec_0_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_lshl_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -127,13 +127,13 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) {
; GFX11-LABEL: uniform_vec_i16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -212,13 +212,13 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) {
; GFX11-LABEL: uniform_vec_f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -277,12 +277,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX9-LABEL: uniform_vec_i16_LL:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
@@ -290,12 +290,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX906-LABEL: uniform_vec_i16_LL:
; GFX906: ; %bb.0:
-; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX906-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
+; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s0
; GFX906-NEXT: ;;#ASMEND
@@ -303,10 +303,10 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX11-LABEL: uniform_vec_i16_LL:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: ;;#ASMSTART
@@ -561,12 +561,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX9-LABEL: uniform_vec_f16_LL:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
@@ -574,12 +574,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX906-LABEL: uniform_vec_f16_LL:
; GFX906: ; %bb.0:
-; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX906-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
+; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s0
; GFX906-NEXT: ;;#ASMEND
@@ -587,10 +587,10 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX11-LABEL: uniform_vec_f16_LL:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: ;;#ASMSTART
@@ -723,13 +723,13 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in,
; GFX11-LABEL: build_vec_v2i16_undeflo_uniform:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX11-NEXT: ds_load_u16_d16 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index b0e1da3..b5933b4 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -7,11 +7,11 @@
define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds1align1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_u8 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b8 v1, v0
; GCN-NEXT: s_endpgm
@@ -23,12 +23,12 @@ define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds2align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:1
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -37,12 +37,12 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds2align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 8, v1
; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
@@ -52,11 +52,11 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds2align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_u16 v0, v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v1, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b16 v1, v0
; UNALIGNED-NEXT: s_endpgm
@@ -68,11 +68,11 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds2align2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_u16 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b16 v1, v0
; GCN-NEXT: s_endpgm
@@ -84,14 +84,14 @@ define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds4align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:3
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -104,15 +104,15 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds4align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3
; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
@@ -130,11 +130,11 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds4align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b32 v0, v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v1, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b32 v1, v0
; UNALIGNED-NEXT: s_endpgm
@@ -146,12 +146,12 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds4align2:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -160,12 +160,12 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds4align2:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:2
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0
@@ -174,11 +174,11 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds4align2:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b32 v0, v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v1, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b32 v1, v0
; UNALIGNED-NEXT: s_endpgm
@@ -190,11 +190,11 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds4align4:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_endpgm
@@ -206,9 +206,9 @@ define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds8align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
@@ -217,7 +217,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5
; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v7, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v7, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v5 offset:4
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -234,9 +234,9 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds8align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
@@ -258,7 +258,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7
; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v2
; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1
; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:1
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, 8
@@ -275,11 +275,11 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds8align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1]
; UNALIGNED-NEXT: s_endpgm
@@ -291,14 +291,14 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds8align2:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4
; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:4
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -311,14 +311,14 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds8align2:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:6
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -331,11 +331,11 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds8align2:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1]
; UNALIGNED-NEXT: s_endpgm
@@ -347,11 +347,11 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds8align4:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; GCN-NEXT: s_endpgm
@@ -363,11 +363,11 @@ define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds8align8:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b64 v[0:1], v0
-; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b64 v2, v[0:1]
; GCN-NEXT: s_endpgm
@@ -379,9 +379,9 @@ define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds12align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
@@ -394,7 +394,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9
; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -415,9 +415,9 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds12align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
@@ -449,7 +449,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3
; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2
; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1
; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1
@@ -473,11 +473,11 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-LABEL: ds12align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT: s_endpgm
@@ -489,15 +489,15 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds12align2:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8
; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4
; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s3
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8
@@ -513,16 +513,16 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds12align2:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:10
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
@@ -539,11 +539,11 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-LABEL: ds12align2:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT: s_endpgm
@@ -555,12 +555,12 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-LABEL: ds12align4:
; ALIGNED: ; %bb.0:
-; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-NEXT: v_mov_b32_e32 v2, s0
+; ALIGNED-NEXT: v_mov_b32_e32 v2, s2
; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-NEXT: ds_read_b32 v2, v2 offset:8
-; ALIGNED-NEXT: v_mov_b32_e32 v3, s1
+; ALIGNED-NEXT: v_mov_b32_e32 v3, s3
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
@@ -569,12 +569,12 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-SDAG-LABEL: ds12align4:
; UNALIGNED-SDAG: ; %bb.0:
-; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s2
; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-SDAG-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -583,11 +583,11 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-GISEL-LABEL: ds12align4:
; UNALIGNED-GISEL: ; %bb.0:
-; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GISEL-NEXT: ds_write_b96 v3, v[0:2]
; UNALIGNED-GISEL-NEXT: s_endpgm
@@ -599,12 +599,12 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds12align8:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s2
; ALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v2
; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1]
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -613,12 +613,12 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds12align8:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s2
; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-GISEL-NEXT: ds_read_b32 v2, v2 offset:8
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
@@ -627,12 +627,12 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-SDAG-LABEL: ds12align8:
; UNALIGNED-SDAG: ; %bb.0:
-; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8
; UNALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v0
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -641,11 +641,11 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-GISEL-LABEL: ds12align8:
; UNALIGNED-GISEL: ; %bb.0:
-; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GISEL-NEXT: ds_write_b96 v3, v[0:2]
; UNALIGNED-GISEL-NEXT: s_endpgm
@@ -657,11 +657,11 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds12align16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b96 v[0:2], v0
-; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b96 v3, v[0:2]
; GCN-NEXT: s_endpgm
@@ -673,9 +673,9 @@ define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %o
define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds16align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
@@ -692,7 +692,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-SDAG-NEXT: ds_read_u8 v14, v0 offset:13
; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -716,9 +716,9 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds16align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
@@ -760,7 +760,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v9
; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v5, v4
; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v4, 8, v1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s3
; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1
; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8
@@ -789,11 +789,11 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-LABEL: ds16align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v4, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
; UNALIGNED-NEXT: s_endpgm
@@ -805,9 +805,9 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds16align2:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12
; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
@@ -815,7 +815,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6
; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8
; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s3
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v1 offset:12
@@ -835,9 +835,9 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds16align2:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
@@ -850,7 +850,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v6, 16, v5
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -867,11 +867,11 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-LABEL: ds16align2:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v4, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
; UNALIGNED-NEXT: s_endpgm
@@ -883,12 +883,12 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-LABEL: ds16align4:
; ALIGNED: ; %bb.0:
-; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-NEXT: v_mov_b32_e32 v2, s0
+; ALIGNED-NEXT: v_mov_b32_e32 v2, s2
; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; ALIGNED-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT: ds_write2_b32 v4, v0, v1 offset1:1
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
@@ -897,12 +897,12 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-SDAG-LABEL: ds16align4:
; UNALIGNED-SDAG: ; %bb.0:
-; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s2
; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s3
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-SDAG-NEXT: ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -911,11 +911,11 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-GISEL-LABEL: ds16align4:
; UNALIGNED-GISEL: ; %bb.0:
-; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-GISEL-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s3
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GISEL-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-GISEL-NEXT: s_endpgm
@@ -927,11 +927,11 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds16align8:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
-; GCN-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GCN-NEXT: s_endpgm
@@ -943,11 +943,11 @@ define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds16align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds16align16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b128 v[0:3], v0
-; GCN-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b128 v4, v[0:3]
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
index 5814b8a..4cd5835 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
@@ -36,7 +36,7 @@ define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace
; GCN-LABEL: {{^}}ds_combine_WAR
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
-; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3) %inptr) {
%addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 06908d2..ee374bd 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -105,12 +105,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1)
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
@@ -151,12 +151,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1)
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
@@ -368,11 +368,11 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C
;
; GFX9-LABEL: simple_write2_two_val_too_far_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -413,11 +413,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr
;
; GFX9-LABEL: simple_write2_two_val_f32_x2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
@@ -469,11 +469,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa
;
; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
@@ -963,11 +963,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
;
; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-ALIGNED: ; %bb.0:
-; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX9-ALIGNED-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
-; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6
+; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1
@@ -979,11 +979,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
;
; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX9-UNALIGNED-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
-; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6
+; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index db3ea4d..e16bb28 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -411,15 +411,15 @@ entry:
; GCN-LABEL: {{^}}bit4_extelt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 3
-; GCN-NEXT: s_lshr_b32 s2, 0x1000100, s2
-; GCN-NEXT: s_and_b32 s2, s2, 1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_lshl_b32 s0, s4, 3
+; GCN-NEXT: s_lshr_b32 s0, 0x1000100, s0
+; GCN-NEXT: s_and_b32 s0, s0, 1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
define amdgpu_kernel void @bit4_extelt(ptr addrspace(1) %out, i32 %sel) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 44d65c9..6823dcf 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -21,32 +21,32 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: extract_vector_elt_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:20
+; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_short v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:20
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: extract_vector_elt_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
-; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:20
+; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[4:5]
+; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:20
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -140,6 +140,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1
; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -147,35 +148,33 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v2, v[1:2]
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_load_dword s1, s[2:3], 0x0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e64 v2, v2, s1
+; VI-NEXT: v_lshrrev_b32_e64 v2, v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -317,18 +316,18 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: v_extractelement_v4f16_2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v2, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_short v[0:1], v2
@@ -336,13 +335,13 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a
;
; GFX11-LABEL: v_extractelement_v4f16_2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] offset:4
+; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -380,43 +379,42 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %
;
; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
-; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b64 v[0:1], v0, v[1:2]
-; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v4
+; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
; VI-NEXT: flat_store_short v[1:2], v0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
-; GFX11-NEXT: buffer_load_b32 v3, off, s[4:7], 0 glc dlc
+; GFX11-NEXT: buffer_load_b32 v3, off, s[0:3], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b64 v[1:2], v3, v[1:2]
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -451,12 +449,12 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4)
;
; VI-LABEL: reduce_load_vector_v8f16_extract_01:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -468,12 +466,12 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4)
;
; GFX11-LABEL: reduce_load_vector_v8f16_extract_01:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
@@ -512,12 +510,12 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4)
;
; VI-LABEL: reduce_load_vector_v8f16_extract_23:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s0, s[2:3], 0x4
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x4
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -529,12 +527,12 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4)
;
; GFX11-LABEL: reduce_load_vector_v8f16_extract_23:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x4
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 8f0d639..b243450 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -149,19 +149,19 @@ define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
;
; VI-LABEL: fabsf_v4f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: s_bitset0_b32 s1, 31
-; VI-NEXT: s_bitset0_b32 s0, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_and_b32 s0, s7, 0x7fffffff
+; VI-NEXT: s_and_b32 s1, s6, 0x7fffffff
+; VI-NEXT: s_bitset0_b32 s5, 31
+; VI-NEXT: s_bitset0_b32 s4, 31
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index cdc6b5a..00d77de 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -36,50 +36,50 @@ define amdgpu_kernel void @fadd_f16(
; VI-LABEL: fadd_f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s2, s10
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s4
-; VI-NEXT: s_mov_b32 s9, s5
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: s_mov_b32 s3, s11
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_e32 v0, v0, v1
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-SDAG-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s11
-; GFX11-SDAG-NEXT: s_mov_b32 s2, s10
+; GFX11-SDAG-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s4
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s5
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_mov_b32 s4, s6
; GFX11-SDAG-NEXT: s_mov_b32 s5, s7
-; GFX11-SDAG-NEXT: s_mov_b32 s6, s10
-; GFX11-SDAG-NEXT: s_mov_b32 s7, s11
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s3
; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -88,17 +88,17 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-GISEL-NEXT: s_mov_b32 s10, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GFX11-GISEL-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[10:11]
-; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -112,24 +112,24 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1
; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s10, -1
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s11
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s10
+; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s10, s2
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s8, s4
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s9, s5
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s6
; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s7
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s10
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s11
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s3
; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
@@ -138,17 +138,17 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1
; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s10, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[10:11]
-; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
@@ -216,94 +216,94 @@ define amdgpu_kernel void @fadd_f16_imm_a(
;
; VI-LABEL: fadd_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_f16_imm_a:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s0, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s1, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s2, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s7
-; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x3c00
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fadd_f16_imm_a:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x3c00
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_a:
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
-; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s2
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s3
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s6
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s7
-; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s6
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s7
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, 1.0, v0
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_a:
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
-; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, 1.0, v0
-; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT: s_nop 0
; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-GISEL-NEXT: s_endpgm
@@ -360,94 +360,94 @@ define amdgpu_kernel void @fadd_f16_imm_b(
;
; VI-LABEL: fadd_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_f16_imm_b:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s0, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s1, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s2, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s7
-; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x4000
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fadd_f16_imm_b:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x4000
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_b:
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
-; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s2
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s3
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s6
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s7
-; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s6
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s7
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, 2.0, v0
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_b:
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
-; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, 2.0, v0
-; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT: s_nop 0
; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-GISEL-NEXT: s_endpgm
@@ -566,12 +566,12 @@ define amdgpu_kernel void @fadd_v2f16(
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -606,12 +606,12 @@ define amdgpu_kernel void @fadd_v2f16(
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1
; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1
; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -684,85 +684,85 @@ define amdgpu_kernel void @fadd_v2f16_imm_a(
;
; VI-LABEL: fadd_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 0x4000
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_v2f16_imm_a:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fadd_v2f16_imm_a:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_a:
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
-; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_a:
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
-; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
-; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT: s_nop 0
; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-GISEL-NEXT: s_endpgm
@@ -823,85 +823,85 @@ define amdgpu_kernel void @fadd_v2f16_imm_b(
;
; VI-LABEL: fadd_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_v2f16_imm_b:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fadd_v2f16_imm_b:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_b:
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
-; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_b:
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
-; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
-; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT: s_nop 0
; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 581b7b4..fb47dae 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -21,20 +21,20 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_undef_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_undef_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v0, s[0:1]
+; GFX9-NEXT: global_store_short v0, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_undef_value_f16:
@@ -49,10 +49,10 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_undef_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -64,10 +64,10 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(
define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
@@ -76,10 +76,10 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
;
; GFX9-LABEL: v_test_canonicalize_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v0, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: global_store_short v[0:1], v0, off
@@ -100,10 +100,10 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
;
; GFX11-LABEL: v_test_canonicalize_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: global_store_b16 v[0:1], v0, off
@@ -119,12 +119,12 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 {
; VI-LABEL: s_test_canonicalize_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -153,12 +153,12 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
; GFX11-LABEL: s_test_canonicalize_var_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v1, s2, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_max_f16_e64 v1, s4, s4
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -239,10 +239,10 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fabs_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e64 v2, |v2|, |v2|
@@ -251,13 +251,13 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_test_canonicalize_fabs_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fabs_var_f16:
@@ -275,13 +275,13 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
;
; GFX11-LABEL: v_test_canonicalize_fabs_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -295,10 +295,10 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e64 v2, -|v2|, -|v2|
@@ -307,13 +307,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
;
; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
@@ -331,13 +331,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
;
; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -352,10 +352,10 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e64 v2, -v2, -v2
@@ -364,13 +364,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_test_canonicalize_fneg_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_var_f16:
@@ -388,13 +388,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
;
; GFX11-LABEL: v_test_canonicalize_fneg_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -408,10 +408,10 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #2 {
; VI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_e32 v2, -1.0, v2
@@ -420,13 +420,13 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
;
; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
@@ -444,13 +444,13 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
;
; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -464,10 +464,10 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #2 {
; VI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_e64 v2, -1.0, |v2|
@@ -476,13 +476,13 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
;
; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
@@ -500,13 +500,13 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
;
; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -521,20 +521,20 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v0, s[0:1]
+; GFX9-NEXT: global_store_short v0, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p0_f16:
@@ -549,10 +549,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: test_fold_canonicalize_p0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -564,21 +564,21 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out)
define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n0_f16:
@@ -593,10 +593,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: test_fold_canonicalize_n0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -608,21 +608,21 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out)
define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p1_f16:
@@ -637,10 +637,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: test_fold_canonicalize_p1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -652,21 +652,21 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out)
define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xffffbc00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffbc00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n1_f16:
@@ -681,10 +681,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: test_fold_canonicalize_n1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -696,21 +696,21 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out)
define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_literal_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_literal_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_literal_f16:
@@ -725,10 +725,10 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %
;
; GFX11-LABEL: test_fold_canonicalize_literal_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -740,21 +740,21 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %
define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
@@ -769,10 +769,10 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1
;
; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -784,21 +784,21 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
@@ -813,10 +813,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -828,21 +828,21 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad
define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
@@ -857,10 +857,10 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1
;
; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -872,21 +872,21 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
@@ -901,10 +901,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -916,21 +916,21 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad
define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_f16:
@@ -945,10 +945,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_qnan_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -960,21 +960,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
@@ -989,10 +989,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1004,21 +1004,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
@@ -1033,10 +1033,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1048,21 +1048,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan0_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan0_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan0_value_f16:
@@ -1077,10 +1077,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1092,21 +1092,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan1_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan1_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan1_value_f16:
@@ -1121,10 +1121,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1136,21 +1136,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan2_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan2_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan2_value_f16:
@@ -1165,10 +1165,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1180,21 +1180,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan3_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan3_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan3_value_f16:
@@ -1209,10 +1209,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1224,32 +1224,32 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(
define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_var_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_var_v2f16:
@@ -1277,13 +1277,13 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: v_test_canonicalize_var_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1298,33 +1298,33 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out)
define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fabs_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_sdwa v1, |v0|, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e64 v0, |v0|, |v0|
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fabs_var_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fabs_var_v2f16:
@@ -1352,15 +1352,15 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %
;
; GFX11-LABEL: v_test_canonicalize_fabs_var_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1376,33 +1376,33 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_sdwa v1, -|v0|, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e64 v0, -|v0|, -|v0|
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
@@ -1431,15 +1431,15 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace
;
; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1456,32 +1456,32 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace
define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_sdwa v1, -v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fneg_var_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_var_v2f16:
@@ -1510,13 +1510,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %
;
; GFX11-LABEL: v_test_canonicalize_fneg_var_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1532,16 +1532,16 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %
define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 {
; VI-LABEL: s_test_canonicalize_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_max_f16_e64 v0, s2, s2
+; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1575,12 +1575,12 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out,
; GFX11-LABEL: s_test_canonicalize_var_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v1, s2, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_pk_max_f16 v1, s4, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1593,20 +1593,20 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out,
define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p0_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p0_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p0_v2f16:
@@ -1621,10 +1621,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_p0_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1636,21 +1636,21 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n0_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x80008000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n0_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x80008000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n0_v2f16:
@@ -1665,10 +1665,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_n0_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1680,21 +1680,21 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3c003c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c003c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p1_v2f16:
@@ -1709,10 +1709,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_p1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1724,21 +1724,21 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xbc00bc00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00bc00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n1_v2f16:
@@ -1753,10 +1753,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_n1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1768,21 +1768,21 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_literal_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4c004c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_literal_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c004c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_literal_v2f16:
@@ -1797,10 +1797,10 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1)
;
; GFX11-LABEL: test_fold_canonicalize_literal_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1812,21 +1812,21 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1)
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
@@ -1841,10 +1841,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p
;
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1856,21 +1856,21 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
@@ -1885,10 +1885,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1900,21 +1900,21 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
@@ -1929,10 +1929,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p
;
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1944,21 +1944,21 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
@@ -1973,10 +1973,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1988,21 +1988,21 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr
define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7c007c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c007c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_v2f16:
@@ -2017,10 +2017,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o
;
; GFX11-LABEL: test_fold_canonicalize_qnan_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2032,21 +2032,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
@@ -2061,10 +2061,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2076,21 +2076,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
@@ -2105,10 +2105,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2120,21 +2120,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan0_value_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan0_value_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan0_value_v2f16:
@@ -2149,10 +2149,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac
;
; GFX11-LABEL: test_fold_canonicalize_snan0_value_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2164,21 +2164,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan1_value_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan1_value_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan1_value_v2f16:
@@ -2193,10 +2193,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac
;
; GFX11-LABEL: test_fold_canonicalize_snan1_value_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2208,21 +2208,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan2_value_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan2_value_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan2_value_v2f16:
@@ -2237,10 +2237,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac
;
; GFX11-LABEL: test_fold_canonicalize_snan2_value_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2252,21 +2252,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan3_value_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan3_value_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan3_value_v2f16:
@@ -2281,10 +2281,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac
;
; GFX11-LABEL: test_fold_canonicalize_snan3_value_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2376,20 +2376,20 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: s_test_canonicalize_undef_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_canonicalize_undef_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: s_test_canonicalize_undef_v2f16:
@@ -2404,10 +2404,10 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: s_test_canonicalize_undef_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2678,22 +2678,22 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: s_test_canonicalize_undef_v4f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_canonicalize_undef_v4f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: s_test_canonicalize_undef_v4f16:
@@ -2709,12 +2709,12 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out
;
; GFX11-LABEL: s_test_canonicalize_undef_v4f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index 7d8f43b..038aad3 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -58,25 +58,25 @@ define amdgpu_kernel void @fcmp_f16_lt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -147,26 +147,26 @@ define amdgpu_kernel void @fcmp_f16_lt_abs(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e64 s0, |v0|, |v1|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, s0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -239,25 +239,25 @@ define amdgpu_kernel void @fcmp_f16_eq(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -328,25 +328,25 @@ define amdgpu_kernel void @fcmp_f16_le(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -417,25 +417,25 @@ define amdgpu_kernel void @fcmp_f16_gt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -506,25 +506,25 @@ define amdgpu_kernel void @fcmp_f16_lg(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -595,25 +595,25 @@ define amdgpu_kernel void @fcmp_f16_ge(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -684,25 +684,25 @@ define amdgpu_kernel void @fcmp_f16_o(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -773,25 +773,25 @@ define amdgpu_kernel void @fcmp_f16_u(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -862,25 +862,25 @@ define amdgpu_kernel void @fcmp_f16_nge(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -951,25 +951,25 @@ define amdgpu_kernel void @fcmp_f16_nlg(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1040,25 +1040,25 @@ define amdgpu_kernel void @fcmp_f16_ngt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1129,25 +1129,25 @@ define amdgpu_kernel void @fcmp_f16_nle(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1218,25 +1218,25 @@ define amdgpu_kernel void @fcmp_f16_neq(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1307,25 +1307,25 @@ define amdgpu_kernel void @fcmp_f16_nlt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1406,20 +1406,20 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1429,7 +1429,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1511,20 +1511,20 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1534,7 +1534,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1615,20 +1615,20 @@ define amdgpu_kernel void @fcmp_v2f16_le(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1638,7 +1638,7 @@ define amdgpu_kernel void @fcmp_v2f16_le(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1719,20 +1719,20 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1742,7 +1742,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1824,20 +1824,20 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1847,7 +1847,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1929,20 +1929,20 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1952,7 +1952,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2034,20 +2034,20 @@ define amdgpu_kernel void @fcmp_v2f16_o(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2057,7 +2057,7 @@ define amdgpu_kernel void @fcmp_v2f16_o(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2139,20 +2139,20 @@ define amdgpu_kernel void @fcmp_v2f16_u(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2162,7 +2162,7 @@ define amdgpu_kernel void @fcmp_v2f16_u(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2243,20 +2243,20 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2266,7 +2266,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2347,20 +2347,20 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2370,7 +2370,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2452,20 +2452,20 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2475,7 +2475,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2556,20 +2556,20 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2579,7 +2579,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2660,20 +2660,20 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2683,7 +2683,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2764,20 +2764,20 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2787,7 +2787,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index fd80580..b2fadbd 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -31,16 +31,16 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag,
;
; VI-LABEL: s_copysign_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_movk_i32 s3, 0x7fff
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s2, 16
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_bfi_b32 v2, s3, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 s1, s4, 16
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_bfi_b32 v2, s0, v0, v1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -61,15 +61,15 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag,
; GFX11-LABEL: s_copysign_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s3
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -93,13 +93,13 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma
;
; VI-LABEL: s_test_copysign_f16_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x7fff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -117,13 +117,13 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma
; GFX11-LABEL: s_test_copysign_f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -147,13 +147,13 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma
;
; VI-LABEL: s_test_copysign_f16_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x7fff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -171,13 +171,13 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma
; GFX11-LABEL: s_test_copysign_f16_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -201,13 +201,13 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_10.0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x7fff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -225,13 +225,13 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_10.0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -255,13 +255,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 15
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x8000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -279,13 +279,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 15
+; GFX11-NEXT: s_or_b32 s0, s4, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,13 +309,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_neg10:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 15
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x8000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -333,13 +333,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_neg10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 15
+; GFX11-NEXT: s_or_b32 s0, s4, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -365,13 +365,13 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_0_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_and_b32_e32 v2, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -389,12 +389,12 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_0_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s4
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -421,14 +421,14 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s2, v0
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -447,14 +447,14 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -481,14 +481,14 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal
;
; VI-LABEL: s_test_copysign_f16_10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s2, v0
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -507,14 +507,14 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal
; GFX11-LABEL: s_test_copysign_f16_10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -540,14 +540,14 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h
;
; VI-LABEL: s_test_copysign_f16_neg1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s2, v0
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -566,14 +566,14 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h
; GFX11-LABEL: s_test_copysign_f16_neg1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -600,14 +600,14 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out,
;
; VI-LABEL: s_test_copysign_f16_neg10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s2, v0
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -626,14 +626,14 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out,
; GFX11-LABEL: s_test_copysign_f16_neg10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -850,19 +850,19 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_ushort v2, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -894,12 +894,12 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -950,19 +950,19 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_ushort v2, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -993,13 +993,14 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1)
;
; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v2, v1, s[6:7]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1050,19 +1051,19 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1093,19 +1094,19 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1)
; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v1, s[4:5]
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1151,19 +1152,19 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1194,19 +1195,19 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v1, s[4:5]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1252,19 +1253,19 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v2, v[0:1]
-; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1295,19 +1296,19 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1)
; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v1, s[4:5]
-; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v1, s[2:3]
+; GFX11-NEXT: global_load_u16 v0, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1353,18 +1354,18 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -1393,19 +1394,19 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
-; GFX11-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1453,19 +1454,19 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v2, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1497,12 +1498,12 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v1, s[6:7]
-; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1904,29 +1905,29 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half
; VI-LABEL: s_copysign_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_movk_i32 s2, 0x7fff
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: s_lshr_b32 s3, s6, 16
+; VI-NEXT: s_lshr_b32 s1, s6, 16
; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v0, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_bfi_b32 v1, s2, v1, v2
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_bfi_b32 v1, s0, v1, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
-; VI-NEXT: s_add_u32 s2, s0, 4
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_store_short v[0:1], v3
+; VI-NEXT: v_bfi_b32 v3, s0, v0, v1
+; VI-NEXT: s_add_u32 s0, s2, 4
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_short v[0:1], v3
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1958,24 +1959,24 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s2, s6, 16
+; GFX11-NEXT: s_lshr_b32 s0, s6, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: s_lshr_b32 s2, s4, 16
+; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GFX11-NEXT: v_mov_b32_e32 v2, s7
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4
-; GFX11-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v3, v2, s[2:3] offset:4
+; GFX11-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2023,31 +2024,31 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half
; VI-LABEL: s_copysign_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_movk_i32 s2, 0x7fff
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: s_lshr_b32 s3, s7, 16
+; VI-NEXT: s_lshr_b32 s1, s7, 16
; VI-NEXT: s_lshr_b32 s5, s5, 16
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v0, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_bfi_b32 v1, s2, v1, v2
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_bfi_b32 v1, s0, v1, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_lshr_b32 s3, s6, 16
+; VI-NEXT: s_lshr_b32 s1, s6, 16
; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v2
+; VI-NEXT: v_bfi_b32 v0, s0, v0, v2
; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_bfi_b32 v2, s2, v2, v3
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -2085,26 +2086,26 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7
; GFX11-NEXT: v_mov_b32_e32 v1, s6
-; GFX11-NEXT: s_lshr_b32 s2, s7, 16
+; GFX11-NEXT: s_lshr_b32 s0, s7, 16
; GFX11-NEXT: s_lshr_b32 s6, s6, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s6
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s6
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s5, v0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s4, v1
-; GFX11-NEXT: s_lshr_b32 s3, s5, 16
-; GFX11-NEXT: s_lshr_b32 s2, s4, 16
-; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s3, v2
-; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s2, v3
+; GFX11-NEXT: s_lshr_b32 s1, s5, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
+; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0
; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index fb04b66..3f5d90e 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -63,26 +63,26 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m
;
; VI-LABEL: s_test_copysign_f32_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s2, 31
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -106,26 +106,26 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m
;
; VI-LABEL: s_test_copysign_f32_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s2, 31
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -149,26 +149,26 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float
;
; VI-LABEL: s_test_copysign_f32_10.0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_10.0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s2, 31
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -192,26 +192,26 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float
;
; VI-LABEL: s_test_copysign_f32_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 31
+; GFX11-NEXT: s_or_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -235,26 +235,26 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa
;
; VI-LABEL: s_test_copysign_f32_neg10:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 31
+; GFX11-NEXT: s_or_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -278,26 +278,26 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa
;
; VI-LABEL: s_test_copysign_f32_0_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_0_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -323,28 +323,28 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa
;
; VI-LABEL: s_test_copysign_f32_1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 1.0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 1.0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 1.0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -369,28 +369,28 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo
;
; VI-LABEL: s_test_copysign_f32_10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 0x41200000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 0x41200000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x41200000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -415,28 +415,28 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f
;
; VI-LABEL: s_test_copysign_f32_neg1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 1.0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 1.0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 1.0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -461,28 +461,28 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out,
;
; VI-LABEL: s_test_copysign_f32_neg10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 0x41200000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 0x41200000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x41200000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -512,17 +512,17 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo
; VI-LABEL: s_test_copysign_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_bfi_b32 v1, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_bfi_b32 v0, s2, v2, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_bfi_b32 v0, s0, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -530,14 +530,14 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
; GFX11-NEXT: v_mov_b32_e32 v2, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -571,20 +571,20 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo
; VI-LABEL: s_test_copysign_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_brev_b32 s7, -2
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_bfi_b32 v2, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v2, s7, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: v_bfi_b32 v1, s2, v3, v0
+; VI-NEXT: v_bfi_b32 v1, s7, v3, v0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v3, s8
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v3
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_bfi_b32 v0, s7, v0, v3
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
;
@@ -592,7 +592,7 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s9
@@ -602,7 +602,7 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v3
-; GFX11-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX11-NEXT: global_store_b96 v4, v[0:2], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -638,23 +638,23 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo
; VI-LABEL: s_test_copysign_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s12, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v3, s12, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s10
-; VI-NEXT: v_bfi_b32 v2, s2, v2, v0
+; VI-NEXT: v_bfi_b32 v2, s12, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_bfi_b32 v1, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v1, s12, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_bfi_b32 v0, s12, v0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -662,7 +662,7 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s11 :: v_dual_mov_b32 v1, s10
@@ -673,7 +673,7 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v4
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v5
-; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v6, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -921,16 +921,16 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out
;
; VI-LABEL: s_test_copysign_f32_fptrunc_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_brev_b32 s2, -2
-; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_bfi_b32 v2, s2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_bfi_b32 v2, s0, v0, v1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -940,12 +940,12 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -972,25 +972,25 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o
;
; VI-LABEL: s_test_copysign_f32_1_fptrunc_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 1.0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 1.0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1063,31 +1063,31 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out
;
; VI-LABEL: s_test_copysign_f32_1_fpext_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 1.0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshl_b32 s0, s4, 16
+; VI-NEXT: s_and_b32 s0, s0, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 1.0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_lshl_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
-; GFX11-NEXT: s_or_b32 s2, s2, 1.0
+; GFX11-NEXT: s_and_b32 s0, s0, 0x80000000
+; GFX11-NEXT: s_or_b32 s0, s0, 1.0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index b5fa3fd..5d5a4e9 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -29,15 +29,15 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_bfi_b32 v1, s4, v0, v1
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_bfi_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -46,14 +46,14 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x74
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[6:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -79,13 +79,13 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: s_test_copysign_f64_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -93,13 +93,13 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_and_b32 s0, s3, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -125,13 +125,13 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: s_test_copysign_f64_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -139,13 +139,13 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_and_b32 s0, s3, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -171,13 +171,13 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: s_test_copysign_f64_10:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -185,13 +185,13 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_and_b32 s0, s3, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -217,13 +217,13 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x
; VI-LABEL: s_test_copysign_f64_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_or_b32 s0, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -231,13 +231,13 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s3, 31
+; GFX11-NEXT: s_or_b32 s0, s3, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -263,13 +263,13 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x
; VI-LABEL: s_test_copysign_f64_neg10:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_or_b32 s0, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -277,13 +277,13 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s3, 31
+; GFX11-NEXT: s_or_b32 s0, s3, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -312,32 +312,32 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i
; VI-LABEL: s_test_copysign_f64_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dword s4, s[0:1], 0x74
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s5, -2
+; VI-NEXT: s_load_dword s6, s[0:1], 0x74
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_bfi_b32 v1, s5, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_bfi_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74
+; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x74
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -366,33 +366,33 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i
;
; VI-LABEL: s_test_copysign_f64_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x74
+; VI-NEXT: s_load_dword s6, s[0:1], 0x74
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s5, -2
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s4
+; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s6
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_bfi_b32 v1, s5, v1, v0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_bfi_b32 v1, s0, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74
+; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x74
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4
+; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -419,24 +419,24 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub
;
; VI-LABEL: s_test_copysign_f64_0_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_0_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -463,26 +463,26 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub
;
; VI-LABEL: s_test_copysign_f64_1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x3ff00000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_1_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x3ff00000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -509,26 +509,26 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou
;
; VI-LABEL: s_test_copysign_f64_10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x40240000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_10_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x40240000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -555,26 +555,26 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d
;
; VI-LABEL: s_test_copysign_f64_neg1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x3ff00000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg1_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x3ff00000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -601,26 +601,26 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out,
;
; VI-LABEL: s_test_copysign_f64_neg10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x40240000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg10_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x40240000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -652,19 +652,19 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou
; VI-LABEL: s_test_copysign_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_brev_b32 s8, -2
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v3, s8, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_bfi_b32 v1, s8, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -672,7 +672,7 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s11
; GFX11-NEXT: v_mov_b32_e32 v2, s9
@@ -681,7 +681,7 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v2
; GFX11-NEXT: v_mov_b32_e32 v2, s6
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -718,28 +718,28 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou
; VI-LABEL: s_test_copysign_v3f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_brev_b32 s10, -2
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s15
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v3, s10, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s13
-; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_bfi_b32 v1, s10, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_mov_b32_e32 v2, s17
-; VI-NEXT: v_bfi_b32 v5, s2, v0, v2
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v7, s1
+; VI-NEXT: v_bfi_b32 v5, s10, v0, v2
; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -747,7 +747,7 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s15
; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s4
@@ -758,8 +758,8 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v7
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b64 v6, v[4:5], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v6, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -801,32 +801,32 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
; VI-LABEL: s_test_copysign_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_brev_b32 s12, -2
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s15
; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v3, s12, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s13
-; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: v_bfi_b32 v1, s12, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v2, s19
-; VI-NEXT: v_bfi_b32 v7, s2, v0, v2
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_bfi_b32 v7, s12, v0, v2
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_mov_b32_e32 v2, s17
-; VI-NEXT: v_bfi_b32 v5, s2, v0, v2
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: v_mov_b32_e32 v9, s1
+; VI-NEXT: v_bfi_b32 v5, s12, v0, v2
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v6, s10
-; VI-NEXT: v_mov_b32_e32 v8, s2
+; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -834,7 +834,7 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s15
; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v2, s10
@@ -848,8 +848,8 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
; GFX11-NEXT: v_mov_b32_e32 v6, s6
; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s5, v5
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index b14b642..cfb608c 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -47,14 +47,14 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX8-LABEL: v_fdiv_f16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -111,12 +111,12 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -178,52 +178,52 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
;
; GFX8-LABEL: v_rcp_f16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rcp_f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -272,52 +272,52 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX8-LABEL: v_rcp_f16_abs:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e64 v3, |v0|
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rcp_f16_abs:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e64 v1, |v1|
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16_abs:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e64 v1, |v1|
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_abs:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e64 v1, |v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -369,52 +369,52 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs
;
; GFX8-LABEL: reciprocal_f16_rounded:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: reciprocal_f16_rounded:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: reciprocal_f16_rounded:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: reciprocal_f16_rounded:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -450,52 +450,52 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX8-LABEL: v_rcp_f16_afn:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rcp_f16_afn:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16_afn:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_afn:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -544,52 +544,52 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX8-LABEL: v_rcp_f16_neg:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e64 v3, -v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rcp_f16_neg:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16_neg:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_neg:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -641,52 +641,52 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
;
; GFX8-LABEL: v_rsq_f16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rsq_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rsq_f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rsq_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rsq_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rsq_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -739,17 +739,17 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX8-LABEL: v_rsq_f16_neg:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rsq_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v3
; GFX8-NEXT: flat_store_short v[0:1], v2
@@ -757,39 +757,39 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX9-LABEL: v_rsq_f16_neg:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rsq_f16_e32 v1, v1
; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_neg:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rsq_f16_e32 v1, v1
; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_neg:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rsq_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -844,16 +844,16 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
;
; GFX8-LABEL: v_rsq_f16_multi_use:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_rsq_f16_e32 v4, v3
; GFX8-NEXT: flat_store_short v[0:1], v3
@@ -863,41 +863,41 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
;
; GFX9-LABEL: v_rsq_f16_multi_use:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rsq_f16_e32 v2, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v2, s[0:1]
+; GFX9-NEXT: global_store_short v0, v2, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_multi_use:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rsq_f16_e32 v2, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-NEXT: global_store_short v0, v2, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_multi_use:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rsq_f16_e32 v2, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v2, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -951,57 +951,57 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr
;
; GFX8-LABEL: v_rsq_f16_missing_contract0:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rsq_f16_missing_contract0:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_missing_contract0:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_missing_contract0:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1054,57 +1054,57 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr
;
; GFX8-LABEL: v_rsq_f16_missing_contract1:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rsq_f16_missing_contract1:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_missing_contract1:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_missing_contract1:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1157,57 +1157,57 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r,
;
; GFX8-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
; GFX8-NEXT: v_rcp_f16_e64 v3, -v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
; GFX9-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
; GFX10-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1251,14 +1251,14 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1)
; GFX8-LABEL: v_fdiv_f16_afn:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1307,12 +1307,12 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1)
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1362,14 +1362,14 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(
; GFX8-LABEL: v_fdiv_f16_unsafe:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1418,12 +1418,12 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1463,46 +1463,46 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX8-LABEL: div_afn_2_x_pat_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v2, 0.5, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: div_afn_2_x_pat_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v0, 0.5, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: div_afn_2_x_pat_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v0, 0.5, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: global_store_short v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: div_afn_2_x_pat_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1530,46 +1530,46 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX8-LABEL: div_afn_k_x_pat_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v2, 0x2e66, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: div_afn_k_x_pat_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: div_afn_k_x_pat_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: global_store_short v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: div_afn_k_x_pat_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1597,46 +1597,46 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX8-LABEL: div_afn_neg_k_x_pat_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v2, 0xae66, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: div_afn_neg_k_x_pat_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v0, 0xae66, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: div_afn_neg_k_x_pat_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v0, 0xae66, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: global_store_short v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: div_afn_neg_k_x_pat_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, 0xae66, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index b639768..92db799 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -1077,7 +1077,6 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-LABEL: s_fdiv_v2f32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0
@@ -1097,6 +1096,7 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_rcp_f32_e32 v5, v2
; GFX8-NEXT: v_div_fixup_f32 v1, v1, s7, v0
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -1108,8 +1108,9 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_div_fixup_f32 v0, v0, s6, v4
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
@@ -1120,7 +1121,6 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, s7, s7, s5
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
@@ -1132,6 +1132,7 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_scale_f32 v2, s2, s6, s6, s4
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s5
; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s6, s4
@@ -1147,14 +1148,14 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v3, v4
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s6, s4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_v2f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s5
; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5
@@ -1185,7 +1186,7 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX11-NEXT: s_denorm_mode 12
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v3, v4
; GFX11-NEXT: v_div_fixup_f32 v0, v0, s6, s4
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1227,12 +1228,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float>
; GFX8-LABEL: s_fdiv_ulp25_v2f32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s6
; GFX8-NEXT: v_rcp_f32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mul_f32_e32 v0, s4, v0
; GFX8-NEXT: v_mul_f32_e32 v1, s5, v1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -1256,14 +1257,14 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float>
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v0, s6
; GFX11-NEXT: v_rcp_f32_e32 v1, s7
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1305,14 +1306,14 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl
; GFX8-LABEL: s_fdiv_v2f32_fast_math:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s7
; GFX8-NEXT: v_rcp_f32_e32 v2, s6
; GFX8-NEXT: v_mul_f32_e32 v1, s5, v0
; GFX8-NEXT: v_mul_f32_e32 v0, s4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1334,14 +1335,14 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v0, s7
; GFX11-NEXT: v_rcp_f32_e32 v2, s6
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1383,14 +1384,14 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl
; GFX8-LABEL: s_fdiv_v2f32_arcp_math:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s7
; GFX8-NEXT: v_rcp_f32_e32 v2, s6
; GFX8-NEXT: v_mul_f32_e32 v1, s5, v0
; GFX8-NEXT: v_mul_f32_e32 v0, s4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1412,14 +1413,14 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v0, s7
; GFX11-NEXT: v_rcp_f32_e32 v2, s6
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2179,10 +2180,10 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX8-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_rcp_f32_e32 v2, v0
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -2194,18 +2195,19 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0
-; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_div_scale_f32 v0, s2, s4, s4, 1.0
+; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
@@ -2217,19 +2219,19 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0
+; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0
-; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0
+; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0
+; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0
; GFX11-NEXT: v_rcp_f32_e32 v1, v0
; GFX11-NEXT: s_denorm_mode 15
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -2242,8 +2244,8 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX11-NEXT: s_denorm_mode 12
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2331,10 +2333,10 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr
; GFX8-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_rcp_f32_e32 v2, v0
; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0
; GFX8-NEXT: v_fma_f32 v2, v3, v2, v2
@@ -2344,52 +2346,53 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr
; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1
; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_div_scale_f32 v0, s2, s4, s4, 1.0
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1
-; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0
+; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0
; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0
+; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0
+; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0
; GFX11-NEXT: v_rcp_f32_e32 v1, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_fma_f32 v2, -v0, v1, 1.0
; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v1
-; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0
+; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0
; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
; GFX11-NEXT: v_fma_f32 v4, -v0, v3, v2
; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
index c56b4ae..fede468 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
@@ -5,28 +5,28 @@
define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_x_25ulp:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s2
+; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2
+; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s0
; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_x_25ulp:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s2
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float 1.000000e+00, %load, !fpmath !0
@@ -37,28 +37,28 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_x_25ulp:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s2
+; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2
+; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s0
; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_x_25ulp:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s2
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float -1.000000e+00, %load, !fpmath !0
@@ -69,28 +69,28 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_minus_x_25ulp:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s2
+; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2
+; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s0
; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_minus_x_25ulp:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s2
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fneg float %load
@@ -102,28 +102,28 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_minus_x_25ulp:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s2
+; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2
+; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s0
; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_25ulp:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s2
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fsub float -0.000000e+00, %load
@@ -512,13 +512,13 @@ define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) {
define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) {
; GCN-LABEL: div_1_by_x_fast:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_rcp_f32_e32 v0, s2
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv fast float 1.000000e+00, %load, !fpmath !0
@@ -529,25 +529,25 @@ define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_x_fast:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_rcp_f32_e64 v0, -s2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_x_fast:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s2
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0
; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv fast float -1.000000e+00, %load, !fpmath !0
@@ -558,13 +558,13 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) {
; GCN-LABEL: div_1_by_minus_x_fast:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_rcp_f32_e64 v0, -s2
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fneg float %load, !fpmath !0
@@ -576,25 +576,25 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_minus_x_fast:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, s2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_fast:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s2
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0
; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fsub float -0.000000e+00, %load, !fpmath !0
@@ -606,11 +606,11 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_x_correctly_rounded:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0
; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -622,16 +622,16 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_x_correctly_rounded:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
+; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0
; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -645,7 +645,7 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float 1.000000e+00, %load
@@ -656,11 +656,11 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_x_correctly_rounded:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, -1.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, -1.0
; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, -1.0, s4, -1.0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -672,16 +672,16 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %
; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, -1.0
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_x_correctly_rounded:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, -1.0
+; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, -1.0
; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, -1.0, s4, -1.0
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -695,7 +695,7 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %
; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, s4, -1.0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float -1.000000e+00, %load
@@ -706,11 +706,11 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %
define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_minus_x_correctly_rounded:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, -1.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, -1.0
; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, -1.0, s4, -1.0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -722,16 +722,16 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %
; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, -1.0
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_minus_x_correctly_rounded:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], -s4, -s4, 1.0
+; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[0:1], -s4, -s4, 1.0
; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, 1.0, -s4, 1.0
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -745,7 +745,7 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %
; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, -s4, 1.0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fsub float -0.000000e+00, %load
@@ -757,11 +757,11 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %
define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_minus_x_correctly_rounded:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0
; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -773,16 +773,16 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspac
; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_correctly_rounded:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], -s4, -s4, -1.0
+; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[0:1], -s4, -s4, -1.0
; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, -1.0, -s4, -1.0
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -796,7 +796,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspac
; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, -s4, -1.0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fsub float -0.000000e+00, %load
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index ab3650f..e0abaa6 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -4354,14 +4354,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3
; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s2, s4, 16
-; GCN2-NEXT: s_addc_u32 s3, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4606,12 +4606,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in,
; GCN2-LABEL: atomic_cmpxchg_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5263,31 +5263,31 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i32_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -5314,29 +5314,29 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i32:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -5368,12 +5368,12 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -5381,8 +5381,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5432,19 +5432,19 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index)
;
; GCN2-LABEL: atomic_load_i32_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5673,31 +5673,31 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f32_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -5724,29 +5724,29 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f32:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -5778,12 +5778,12 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_f32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -5791,8 +5791,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5842,19 +5842,19 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index)
;
; GCN2-LABEL: atomic_load_f32_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -6083,31 +6083,31 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i8_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_byte v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i8_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -6134,29 +6134,29 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i8:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_byte v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i8:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -6188,10 +6188,10 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %
; GCN2-LABEL: atomic_load_i8_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s4, s0
-; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: s_add_u32 s0, s4, s2
+; GCN2-NEXT: s_addc_u32 s1, s5, s3
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -6378,31 +6378,31 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i16_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i16_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -6429,29 +6429,29 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i16:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i16:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -6483,12 +6483,12 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_i16_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -6496,8 +6496,8 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -7963,31 +7963,31 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f16_offset:
; GCN2: ; %bb.0:
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f16_offset:
; GCN3: ; %bb.0:
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%gep = getelementptr half, ptr %in, i64 8
@@ -8013,29 +8013,29 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f16:
; GCN2: ; %bb.0:
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f16:
; GCN3: ; %bb.0:
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%val = load atomic half, ptr %in seq_cst, align 2
@@ -8062,31 +8062,31 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_bf16_offset:
; GCN2: ; %bb.0:
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_bf16_offset:
; GCN3: ; %bb.0:
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr %in, i64 8
@@ -8112,29 +8112,29 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_bf16:
; GCN2: ; %bb.0:
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_bf16:
; GCN3: ; %bb.0:
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%val = load atomic bfloat, ptr %in seq_cst, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 816142d..1d204ac 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -3953,14 +3953,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -3971,7 +3971,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_max_i32_e32 v2, s4, v3
+; GCN2-NEXT: v_max_i32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3981,8 +3981,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_execnz .LBB89_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -4152,14 +4152,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
;
; GCN2-LABEL: atomic_max_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1]
@@ -4168,7 +4168,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_max_i32_e32 v2, s4, v3
+; GCN2-NEXT: v_max_i32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4178,8 +4178,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: s_cbranch_execnz .LBB91_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5096,14 +5096,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -5114,7 +5114,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_max_u32_e32 v2, s4, v3
+; GCN2-NEXT: v_max_u32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5124,8 +5124,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: s_cbranch_execnz .LBB103_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5205,14 +5205,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1]
@@ -5221,7 +5221,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_max_u32_e32 v2, s4, v3
+; GCN2-NEXT: v_max_u32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5231,8 +5231,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN2-NEXT: s_cbranch_execnz .LBB104_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -6890,14 +6890,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -6908,7 +6908,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_min_i32_e32 v2, s4, v3
+; GCN2-NEXT: v_min_i32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6918,8 +6918,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_execnz .LBB126_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -7076,14 +7076,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
;
; GCN2-LABEL: atomic_min_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1]
@@ -7092,7 +7092,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_min_i32_e32 v2, s4, v3
+; GCN2-NEXT: v_min_i32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -7102,8 +7102,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: s_cbranch_execnz .LBB128_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index b8c8d99..fa5a0db 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -21,13 +21,13 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_add_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -36,10 +36,10 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_add_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -72,20 +72,20 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_add_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -93,10 +93,10 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -133,10 +133,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_add_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -152,12 +152,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_add_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32
@@ -195,38 +195,38 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_add_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -253,12 +253,12 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_add_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -266,10 +266,10 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_add_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -300,12 +300,12 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_add_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -318,10 +318,10 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -355,10 +355,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_add_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -372,12 +372,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_add_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1]
@@ -412,36 +412,36 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_add_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -469,13 +469,13 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_and_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -484,10 +484,10 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_and_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -520,20 +520,20 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_and_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -541,10 +541,10 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -581,10 +581,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_and_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -600,12 +600,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_and_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32
@@ -643,38 +643,38 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_and_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -701,12 +701,12 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_and_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -714,10 +714,10 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_and_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -748,12 +748,12 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_and_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -766,10 +766,10 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -803,10 +803,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_and_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -820,12 +820,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_and_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1]
@@ -860,36 +860,36 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_and_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -917,13 +917,13 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_sub_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -932,10 +932,10 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_sub_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -968,20 +968,20 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_sub_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -989,10 +989,10 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1029,10 +1029,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_sub_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -1048,12 +1048,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_sub_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32
@@ -1091,38 +1091,38 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_sub_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1149,12 +1149,12 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_sub_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -1162,10 +1162,10 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_sub_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1196,12 +1196,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_sub_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -1214,10 +1214,10 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1251,10 +1251,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_sub_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -1268,12 +1268,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_sub_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1]
@@ -1308,36 +1308,36 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_sub_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1364,13 +1364,13 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_max_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -1378,10 +1378,10 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_max_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1414,19 +1414,19 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_max_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -1435,10 +1435,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1474,10 +1474,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_max_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -1492,12 +1492,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_max_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32
@@ -1535,38 +1535,38 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_max_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1592,22 +1592,22 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_max_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1638,12 +1638,12 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_max_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v2, s6
@@ -1656,10 +1656,10 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1692,10 +1692,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_max_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -1708,12 +1708,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_max_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1]
@@ -1748,36 +1748,36 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_max_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1804,13 +1804,13 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_umax_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -1818,10 +1818,10 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_umax_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1854,19 +1854,19 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umax_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -1875,10 +1875,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1914,10 +1914,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-LABEL: atomic_umax_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -1932,12 +1932,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-LABEL: atomic_umax_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32
@@ -1975,38 +1975,38 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2032,22 +2032,22 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_umax_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2078,12 +2078,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_umax_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v2, s6
@@ -2096,10 +2096,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2132,10 +2132,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_umax_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -2148,12 +2148,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_umax_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1]
@@ -2188,36 +2188,36 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2244,13 +2244,13 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_min_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -2258,10 +2258,10 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_min_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2294,19 +2294,19 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_min_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -2315,10 +2315,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2354,10 +2354,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_min_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -2372,12 +2372,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_min_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32
@@ -2415,38 +2415,38 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2472,22 +2472,22 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_min_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2518,12 +2518,12 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_min_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v2, s6
@@ -2536,10 +2536,10 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2572,10 +2572,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_min_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -2588,12 +2588,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_min_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1]
@@ -2628,36 +2628,36 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_min_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2684,13 +2684,13 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_umin_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -2698,10 +2698,10 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_umin_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2734,19 +2734,19 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umin_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -2755,10 +2755,10 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2794,10 +2794,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-LABEL: atomic_umin_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -2812,12 +2812,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-LABEL: atomic_umin_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32
@@ -2855,38 +2855,38 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_umin_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2912,22 +2912,22 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_umin_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2958,12 +2958,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_umin_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v2, s6
@@ -2976,10 +2976,10 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3012,10 +3012,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_umin_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -3028,12 +3028,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_umin_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1]
@@ -3068,36 +3068,36 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umin_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -3125,13 +3125,13 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_or_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3140,10 +3140,10 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_or_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3176,20 +3176,20 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
;
; GCN2-LABEL: atomic_or_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -3197,10 +3197,10 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3237,10 +3237,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
; GCN2-LABEL: atomic_or_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -3256,12 +3256,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
; GFX12-LABEL: atomic_or_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32
@@ -3299,38 +3299,38 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_or_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -3357,12 +3357,12 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_or_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3370,10 +3370,10 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_or_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3404,12 +3404,12 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_or_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3422,10 +3422,10 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3459,10 +3459,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN2-LABEL: atomic_or_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -3476,12 +3476,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX12-LABEL: atomic_or_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1]
@@ -3516,36 +3516,36 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
;
; GCN2-LABEL: atomic_or_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -3573,13 +3573,13 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_xchg_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3588,10 +3588,10 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_xchg_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3620,13 +3620,13 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
;
; GCN2-LABEL: atomic_xchg_f64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3635,10 +3635,10 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
;
; GFX12-LABEL: atomic_xchg_f64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3667,13 +3667,13 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
;
; GCN2-LABEL: atomic_xchg_pointer_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3682,10 +3682,10 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
;
; GFX12-LABEL: atomic_xchg_pointer_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3718,20 +3718,20 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_xchg_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -3739,10 +3739,10 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3779,10 +3779,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-LABEL: atomic_xchg_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -3798,12 +3798,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-LABEL: atomic_xchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32
@@ -3841,38 +3841,38 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -3899,12 +3899,12 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_xchg_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3912,10 +3912,10 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_xchg_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3946,12 +3946,12 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_xchg_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3964,10 +3964,10 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4001,10 +4001,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_xchg_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -4018,12 +4018,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_xchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1]
@@ -4058,36 +4058,36 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_xchg_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4115,13 +4115,13 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_xor_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4130,10 +4130,10 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_xor_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4166,20 +4166,20 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_xor_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -4187,10 +4187,10 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4227,10 +4227,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_xor_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -4246,12 +4246,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_xor_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32
@@ -4289,38 +4289,38 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_xor_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4347,12 +4347,12 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_xor_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4360,10 +4360,10 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_xor_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4394,12 +4394,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_xor_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4412,10 +4412,10 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4449,10 +4449,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_xor_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -4466,12 +4466,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_xor_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1]
@@ -4506,36 +4506,36 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_xor_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4564,26 +4564,26 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
@@ -4613,24 +4613,24 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4665,12 +4665,12 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -4678,20 +4678,20 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
@@ -4728,31 +4728,31 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index)
;
; GCN2-LABEL: atomic_load_i64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
@@ -4783,23 +4783,23 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) {
;
; GCN2-LABEL: atomic_store_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: s_add_u32 s0, s2, 32
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: s_addc_u32 s1, s3, 0
+; GCN2-NEXT: s_add_u32 s0, s6, 32
+; GCN2-NEXT: s_addc_u32 s1, s7, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32
; GFX12-NEXT: s_endpgm
entry:
@@ -4822,21 +4822,21 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) {
;
; GCN2-LABEL: atomic_store_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4865,10 +4865,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64
; GCN2-LABEL: atomic_store_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -4882,14 +4882,14 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64
; GFX12-LABEL: atomic_store_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32
; GFX12-NEXT: s_endpgm
entry:
@@ -4918,10 +4918,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index
; GCN2-LABEL: atomic_store_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -4933,14 +4933,14 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index
; GFX12-LABEL: atomic_store_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4971,16 +4971,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GCN2-LABEL: atomic_cmpxchg_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s2, s4, 32
-; GCN2-NEXT: s_addc_u32 s3, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4990,11 +4990,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5027,16 +5027,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GCN2-LABEL: atomic_cmpxchg_i64_soffset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s2, s4, 0x11940
-; GCN2-NEXT: s_addc_u32 s3, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: s_add_u32 s0, s4, 0x11940
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5046,11 +5046,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5084,35 +5084,35 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6
;
; GCN2-LABEL: atomic_cmpxchg_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
+; GCN2-NEXT: v_mov_b32_e32 v2, s10
+; GCN2-NEXT: v_mov_b32_e32 v3, s11
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -5146,18 +5146,18 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
;
; GCN2-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_mov_b32_e32 v2, s10
+; GCN2-NEXT: v_mov_b32_e32 v3, s11
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5166,12 +5166,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5212,19 +5212,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: s_add_u32 s0, s4, s2
-; GCN2-NEXT: s_addc_u32 s3, s5, s3
-; GCN2-NEXT: s_add_u32 s2, s0, 32
-; GCN2-NEXT: s_addc_u32 s3, s3, 0
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: s_add_u32 s0, s0, 32
+; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s9
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5237,13 +5237,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3]
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5279,14 +5279,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GCN2-LABEL: atomic_cmpxchg_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v4, s4
; GCN2-NEXT: v_mov_b32_e32 v5, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5296,11 +5296,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5331,33 +5331,33 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in,
;
; GCN2-LABEL: atomic_cmpxchg_i64_ret:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
+; GCN2-NEXT: v_mov_b32_e32 v2, s10
+; GCN2-NEXT: v_mov_b32_e32 v3, s11
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -5388,16 +5388,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
;
; GCN2-LABEL: atomic_cmpxchg_i64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_mov_b32_e32 v2, s10
+; GCN2-NEXT: v_mov_b32_e32 v3, s11
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5406,12 +5406,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5449,17 +5449,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GCN2-NEXT: s_add_u32 s2, s4, s2
-; GCN2-NEXT: s_addc_u32 s3, s5, s3
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s9
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5472,13 +5472,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3]
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5512,26 +5512,26 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
@@ -5561,24 +5561,24 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5613,12 +5613,12 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_f64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -5626,20 +5626,20 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
@@ -5676,31 +5676,31 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index)
;
; GCN2-LABEL: atomic_load_f64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
@@ -5731,23 +5731,23 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) {
;
; GCN2-LABEL: atomic_store_f64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: s_add_u32 s0, s2, 32
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: s_addc_u32 s1, s3, 0
+; GCN2-NEXT: s_add_u32 s0, s6, 32
+; GCN2-NEXT: s_addc_u32 s1, s7, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32
; GFX12-NEXT: s_endpgm
entry:
@@ -5770,21 +5770,21 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) {
;
; GCN2-LABEL: atomic_store_f64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -5813,10 +5813,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out,
; GCN2-LABEL: atomic_store_f64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -5830,14 +5830,14 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out,
; GFX12-LABEL: atomic_store_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32
; GFX12-NEXT: s_endpgm
entry:
@@ -5866,10 +5866,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in
; GCN2-LABEL: atomic_store_f64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -5881,14 +5881,14 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in
; GFX12-LABEL: atomic_store_f64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -5915,13 +5915,13 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_inc_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5930,10 +5930,10 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_inc_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5966,20 +5966,20 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_inc_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -5987,10 +5987,10 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6027,10 +6027,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_inc_i64_incr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -6046,12 +6046,12 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_inc_i64_incr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
@@ -6089,38 +6089,38 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_inc_i64_ret_incr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -6147,12 +6147,12 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_inc_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6160,10 +6160,10 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_inc_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6194,12 +6194,12 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_inc_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6212,10 +6212,10 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6249,10 +6249,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_inc_i64_incr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -6266,12 +6266,12 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_inc_i64_incr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1]
@@ -6306,36 +6306,36 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_inc_i64_ret_incr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -6363,13 +6363,13 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_dec_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6378,10 +6378,10 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_dec_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6414,20 +6414,20 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_dec_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -6435,10 +6435,10 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6475,10 +6475,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_dec_i64_decr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -6494,12 +6494,12 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_dec_i64_decr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
@@ -6537,38 +6537,38 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_dec_i64_ret_decr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -6595,12 +6595,12 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_dec_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6608,10 +6608,10 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_dec_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6642,12 +6642,12 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_dec_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6660,10 +6660,10 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6697,10 +6697,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_dec_i64_decr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -6714,12 +6714,12 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_dec_i64_decr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1]
@@ -6754,36 +6754,36 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_dec_i64_ret_decr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index d812b4b..19601b1 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -4292,24 +4292,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
;
; GCN2-LABEL: atomic_max_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -4402,25 +4402,25 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_max_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -4432,30 +4432,30 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_execnz .LBB89_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -4467,8 +4467,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_cbranch_execnz .LBB89_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -4514,22 +4514,22 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
;
; GCN2-LABEL: atomic_max_i64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -4619,23 +4619,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_max_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -4647,30 +4647,30 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_execnz .LBB91_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -4682,8 +4682,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN3-NEXT: s_cbranch_execnz .LBB91_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -5674,24 +5674,24 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
;
; GCN2-LABEL: atomic_umax_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -5784,25 +5784,25 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5814,30 +5814,30 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: s_cbranch_execnz .LBB103_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -5849,8 +5849,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN3-NEXT: s_cbranch_execnz .LBB103_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -5899,23 +5899,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5927,30 +5927,30 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN2-NEXT: s_cbranch_execnz .LBB104_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5962,8 +5962,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN3-NEXT: s_cbranch_execnz .LBB104_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -7898,24 +7898,24 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
;
; GCN2-LABEL: atomic_min_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8008,25 +8008,25 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -8038,30 +8038,30 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_execnz .LBB126_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -8073,8 +8073,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_cbranch_execnz .LBB126_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -8118,20 +8118,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_min_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8139,29 +8139,29 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB127_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i64:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v6, s3
-; GCN3-NEXT: v_mov_b32_e32 v7, s2
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8169,9 +8169,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB127_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
@@ -8218,23 +8218,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_min_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -8246,30 +8246,30 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_execnz .LBB128_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -8281,8 +8281,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN3-NEXT: s_cbranch_execnz .LBB128_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index 2a9a9ef..7bbbb6d 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -948,12 +948,12 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -967,12 +967,12 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
@@ -1041,12 +1041,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1060,12 +1060,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
@@ -1133,12 +1133,12 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1153,12 +1153,12 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1226,12 +1226,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1246,12 +1246,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1319,12 +1319,12 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1339,12 +1339,12 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1412,12 +1412,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1432,12 +1432,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1505,12 +1505,12 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1525,12 +1525,12 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1598,12 +1598,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1618,12 +1618,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1691,12 +1691,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1711,12 +1711,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1784,12 +1784,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1804,12 +1804,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1877,12 +1877,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1897,12 +1897,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
@@ -1970,12 +1970,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1990,12 +1990,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
@@ -2081,13 +2081,13 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
;
; GFX11-NOFMA-LABEL: test_f32_interp:
; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x2
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: global_load_b32 v3, v0, s[2:3]
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[10:11]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-NOFMA-NEXT: global_load_b32 v3, v0, s[6:7]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v4, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
@@ -2095,26 +2095,26 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_fmac_f32_e32 v2, v3, v1
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: test_f32_interp:
; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x2
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[2:3]
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[8:9]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[10:11]
+; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[6:7]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v2, v1, v1
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v1, v3, v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
@@ -2165,13 +2165,13 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
;
; GFX11-NOFMA-LABEL: test_f64_interp:
; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x2
-; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v8, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v8, s[4:5]
-; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v8, s[2:3]
+; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v8, s[10:11]
+; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v8, s[8:9]
+; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v8, s[6:7]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2)
; GFX11-NOFMA-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
@@ -2179,26 +2179,26 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
; GFX11-NOFMA-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
-; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[4:5]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: test_f64_interp:
; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x2
-; GFX11-FMA-NEXT: global_load_b64 v[0:1], v6, s[4:5]
-; GFX11-FMA-NEXT: global_load_b64 v[2:3], v6, s[6:7]
-; GFX11-FMA-NEXT: global_load_b64 v[4:5], v6, s[2:3]
+; GFX11-FMA-NEXT: global_load_b64 v[0:1], v6, s[8:9]
+; GFX11-FMA-NEXT: global_load_b64 v[2:3], v6, s[10:11]
+; GFX11-FMA-NEXT: global_load_b64 v[4:5], v6, s[6:7]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
-; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[4:5]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
@@ -2236,15 +2236,15 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_fmac_f32_e32 v2, 2.0, v1
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2282,15 +2282,15 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX11-LABEL: fma_2.0_neg_a_b_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_fmac_f32_e32 v2, -2.0, v1
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2333,19 +2333,19 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac
;
; GFX11-LABEL: fma_neg_b_c_v4f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[4:7], v12, s[2:3]
-; GFX11-NEXT: global_load_b128 v[8:11], v12, s[2:3] offset:48
+; GFX11-NEXT: global_load_b128 v[0:3], v12, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[4:7], v12, s[6:7]
+; GFX11-NEXT: global_load_b128 v[8:11], v12, s[6:7] offset:48
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_fma_f32 v3, v11, -v7, -v3
; GFX11-NEXT: v_fma_f32 v2, v10, -v6, -v2
; GFX11-NEXT: v_fma_f32 v1, v9, -v5, -v1
; GFX11-NEXT: v_fma_f32 v0, v8, -v4, -v0
-; GFX11-NEXT: global_store_b128 v12, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 23eb730..36d917f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -37,92 +37,92 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmax3_olt_0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max3_f32 v0, v0, v1, v2
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmax3_olt_0_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
-; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_0_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -169,92 +169,92 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmax3_olt_1_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max3_f32 v0, v2, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmax3_olt_1_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f32 v0, v2, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_1_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -304,96 +304,96 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmax3_olt_0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_max_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e32 v1, v2, v2
; VI-NEXT: v_max_f16_e32 v0, v0, v1
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmax3_olt_0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -444,96 +444,96 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmax3_olt_1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_max_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e32 v1, v2, v2
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmax3_olt_1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f16 v0, v2, v0, v1
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
index 01b2f20..35621f8 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
@@ -28,15 +28,15 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmax_legacy_uge_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -80,15 +80,15 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmax_legacy_oge_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -132,15 +132,15 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmax_legacy_ugt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -184,15 +184,15 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmax_legacy_ogt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index 87ac95a..a8815c2 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -263,12 +263,12 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_b32 v2, v0, s[2:3] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_maximum_f32 v1, v1, v2
; GCN-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -287,12 +287,12 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_u16 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_u16 v2, v0, s[2:3] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_maximum_f16 v1, v1, v2
; GCN-NEXT: global_store_b16 v0, v1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 764fb99..4543038 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -45,15 +45,15 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -63,16 +63,16 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -83,27 +83,27 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -156,15 +156,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -174,16 +174,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -194,27 +194,27 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -268,15 +268,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -286,16 +286,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -306,27 +306,27 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
;
; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -380,15 +380,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -398,16 +398,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -418,27 +418,27 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
;
; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -494,15 +494,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -513,17 +513,17 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -534,28 +534,28 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
;
; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_max_f32_e32 v1, 4.0, v1
; GFX9-NEXT: v_min_f32_e32 v1, 2.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_maxmin_f32 v1, v1, 4.0, 2.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -617,15 +617,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -639,16 +639,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -663,50 +663,50 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
;
; GFX9-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1
; GFX9-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v1
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1
; GFX11-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1
-; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[4:5] dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -714,18 +714,18 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
;
; GFX11-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v2, v1, 2.0, 4.0
; GFX11-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[4:5] dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5] dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -783,15 +783,15 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
@@ -802,16 +802,16 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_fmed3_r_i_i_f64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -823,29 +823,29 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_test_fmed3_r_i_i_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_r_i_i_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -897,15 +897,15 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_med3_f32 v2, v3, 2.0, 4.0
@@ -914,16 +914,16 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -933,24 +933,24 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1004,15 +1004,15 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -1022,16 +1022,16 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1045,52 +1045,52 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
;
; GFX9-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-GISEL-NEXT: v_cmp_nlt_f32_e32 vcc, 2.0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc
; GFX9-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc, 4.0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -1098,7 +1098,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 4.0, v1
; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1170,17 +1170,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1188,8 +1188,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1197,19 +1197,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1218,8 +1218,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -1229,67 +1229,67 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1360,17 +1360,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1378,8 +1378,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, -v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1387,19 +1387,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1408,8 +1408,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2
@@ -1419,67 +1419,67 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, -v2, v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, -v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1550,17 +1550,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1568,8 +1568,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, -v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1577,19 +1577,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1598,8 +1598,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v3, -1.0, v3
@@ -1609,67 +1609,67 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, -v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, -v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1741,17 +1741,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1759,8 +1759,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -v7, |v2|, -|v3|
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1768,19 +1768,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1789,8 +1789,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -1801,69 +1801,69 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3|
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3|
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1942,17 +1942,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1960,8 +1960,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -|v7|, -|v2|, -|v3|
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1969,19 +1969,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1990,8 +1990,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e64 v4, -1.0, |v7|
@@ -2003,71 +2003,71 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3|
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3|
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
; GFX11-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -2151,17 +2151,17 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
;
; VI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2169,8 +2169,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
@@ -2181,19 +2181,19 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
;
; VI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2202,8 +2202,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
@@ -2215,38 +2215,38 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2320,17 +2320,17 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
;
; VI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2338,8 +2338,8 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2347,19 +2347,19 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
;
; VI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2368,8 +2368,8 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2378,32 +2378,32 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_input_calls_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2472,17 +2472,17 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_nnan_call_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2490,8 +2490,8 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2499,19 +2499,19 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_nnan_call_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2520,8 +2520,8 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2530,32 +2530,32 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_nnan_call_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_call_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2624,17 +2624,17 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_fast_call_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2642,8 +2642,8 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2651,19 +2651,19 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_fast_call_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2672,8 +2672,8 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2682,32 +2682,32 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_fast_call_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_fast_call_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2788,17 +2788,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2806,8 +2806,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2815,19 +2815,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2836,8 +2836,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2846,32 +2846,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2940,17 +2940,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2958,8 +2958,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2967,19 +2967,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2988,8 +2988,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2998,32 +2998,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3093,17 +3093,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3111,8 +3111,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3120,19 +3120,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3141,8 +3141,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -3152,67 +3152,67 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -3282,17 +3282,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3300,8 +3300,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3309,19 +3309,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3330,8 +3330,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -3340,32 +3340,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3434,17 +3434,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3452,8 +3452,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3461,19 +3461,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3482,8 +3482,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -3492,32 +3492,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3586,17 +3586,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3604,8 +3604,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3613,19 +3613,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3634,8 +3634,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -3644,32 +3644,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3738,17 +3738,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3756,8 +3756,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3765,19 +3765,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3786,8 +3786,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -3796,32 +3796,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3890,17 +3890,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3908,8 +3908,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3917,19 +3917,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3938,8 +3938,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -3948,32 +3948,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4042,17 +4042,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4060,8 +4060,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4069,19 +4069,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4090,8 +4090,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -4100,32 +4100,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4194,17 +4194,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4212,8 +4212,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4221,19 +4221,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4242,8 +4242,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -4252,32 +4252,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4346,17 +4346,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4364,8 +4364,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4373,19 +4373,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4394,8 +4394,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -4404,32 +4404,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4498,17 +4498,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4516,8 +4516,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4525,19 +4525,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4546,8 +4546,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -4556,32 +4556,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4650,17 +4650,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4668,8 +4668,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4677,19 +4677,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4698,8 +4698,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -4708,32 +4708,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4802,17 +4802,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4820,8 +4820,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4829,19 +4829,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4850,8 +4850,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -4860,32 +4860,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4954,17 +4954,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4972,8 +4972,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4981,19 +4981,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5002,8 +5002,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -5012,32 +5012,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5106,17 +5106,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5124,8 +5124,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -5133,19 +5133,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5154,8 +5154,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -5164,32 +5164,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5258,17 +5258,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5276,8 +5276,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -5285,19 +5285,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5306,8 +5306,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -5316,32 +5316,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5413,17 +5413,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5431,8 +5431,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -5440,19 +5440,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5461,8 +5461,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -5471,32 +5471,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5588,17 +5588,17 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5606,8 +5606,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -5623,19 +5623,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5644,8 +5644,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
@@ -5662,14 +5662,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
@@ -5680,19 +5680,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -5701,7 +5701,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4
; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5790,17 +5790,17 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5808,8 +5808,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -5825,19 +5825,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5846,8 +5846,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
@@ -5864,14 +5864,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
@@ -5882,19 +5882,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5904,21 +5904,21 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3
; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v4, off dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -5927,7 +5927,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; GFX11-GISEL-NEXT: v_minmax_f32 v2, v1, v2, v4
; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -6016,17 +6016,17 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6034,8 +6034,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -6051,19 +6051,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6072,8 +6072,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
@@ -6090,14 +6090,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
@@ -6108,19 +6108,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-NEXT: v_max_f32_e32 v3, v3, v3
@@ -6129,7 +6129,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3
; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -6210,26 +6210,26 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
; VI-SDAG-NEXT: flat_load_dword v6, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s10, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v4
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v6
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -6243,19 +6243,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
@@ -6264,8 +6264,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
@@ -6280,14 +6280,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
@@ -6296,47 +6296,47 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
; GFX9-NEXT: v_max_f32_e32 v2, v3, v3
; GFX9-NEXT: v_min_f32_e32 v1, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-SDAG-NEXT: v_max_f32_e32 v3, v3, v3
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_maxmin_f32 v3, v1, v2, v3
; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -6411,17 +6411,17 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6429,8 +6429,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
@@ -6441,19 +6441,19 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6462,8 +6462,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
@@ -6475,38 +6475,38 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
;
; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -6586,17 +6586,17 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6604,8 +6604,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
@@ -6616,19 +6616,19 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6637,8 +6637,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
@@ -6650,38 +6650,38 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
;
; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -6761,17 +6761,17 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6779,8 +6779,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
@@ -6791,19 +6791,19 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6812,8 +6812,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
@@ -6825,38 +6825,38 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
;
; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -6931,17 +6931,17 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
;
; VI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6949,8 +6949,8 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -6958,19 +6958,19 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
;
; VI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6979,8 +6979,8 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -6990,67 +6990,67 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -7126,26 +7126,26 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
; VI-SDAG-NEXT: flat_load_dword v6, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s10, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v4
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_min_f32_e64 v4, -v6, v2
; VI-SDAG-NEXT: v_max_f32_e32 v2, v6, v2
@@ -7156,19 +7156,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
@@ -7177,8 +7177,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -7191,77 +7191,77 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_min_f32_e64 v4, -v1, v2
; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1
; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-GISEL-NEXT: v_min_f32_e32 v2, v4, v2
; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_maxmin_f32 v3, v1, v2, v3
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_minmax_f32 v1, -v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1
; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v4, v2
; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -7334,17 +7334,17 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
;
; VI-SDAG-LABEL: v_test_global_nnans_min_max_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -7352,8 +7352,8 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_max_f32_e32 v2, v7, v2
; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3
@@ -7362,19 +7362,19 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
;
; VI-GISEL-LABEL: v_test_global_nnans_min_max_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -7383,8 +7383,8 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_max_f32_e32 v2, v7, v2
@@ -7394,33 +7394,33 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
;
; GFX9-LABEL: v_test_global_nnans_min_max_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_min_max_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_maxmin_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -7487,15 +7487,15 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_ushort v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f16_e32 v2, 1.0, v3
@@ -7506,17 +7506,17 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_ushort v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_f16_e32 v2, 1.0, v3
@@ -7527,27 +7527,27 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f16 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f16 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -7644,17 +7644,17 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
;
; VI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 1, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_ushort v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -7662,8 +7662,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_ushort v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f16_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f16_e32 v2, 2.0, v2
@@ -7677,19 +7677,19 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
;
; VI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_ushort v7, v[0:1] glc
@@ -7698,8 +7698,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_ushort v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f16_e32 v4, 1.0, v7
@@ -7714,39 +7714,39 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_ushort v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f16_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f16_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f16 v1, v1, v2, v3
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_u16 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX11-NEXT: v_add_f16_e32 v2, 2.0, v2
; GFX11-NEXT: v_add_f16_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f16 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -7810,15 +7810,15 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; VI-SDAG-LABEL: two_non_inline_constant:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -7829,17 +7829,17 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; VI-GISEL-LABEL: two_non_inline_constant:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -7850,45 +7850,45 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: two_non_inline_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v1
; GFX9-NEXT: v_min_f32_e32 v1, 0x41800000, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: two_non_inline_constant:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0x41000000
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s2, 0x41000000
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_maxmin_f32 v1, v1, s2, 0x41800000
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_maxmin_f32 v1, v1, s0, 0x41800000
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: two_non_inline_constant:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, 0x41000000, v2
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -7952,16 +7952,16 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; VI-SDAG-LABEL: one_non_inline_constant:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -7974,17 +7974,17 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; VI-GISEL-LABEL: one_non_inline_constant:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -7998,32 +7998,32 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: one_non_inline_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v3, 0.5, v1
; GFX9-NEXT: v_add_f32_e32 v1, 0x41800000, v1
; GFX9-NEXT: v_med3_f32 v2, v3, 1.0, v2
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: one_non_inline_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v2, 0.5, v1
; GFX11-NEXT: v_add_f32_e32 v1, 0x41800000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_med3_f32 v2, v2, 1.0, 0x41800000
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
@@ -8099,21 +8099,21 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: two_non_inline_constant_multi_use:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT: s_mov_b32 s0, 0x41000000
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: s_mov_b32 s2, 0x41000000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v3
-; VI-SDAG-NEXT: v_med3_f32 v2, v2, s2, v4
+; VI-SDAG-NEXT: v_med3_f32 v2, v2, s0, v4
; VI-SDAG-NEXT: v_add_f32_e32 v5, 0x41800000, v3
; VI-SDAG-NEXT: v_add_f32_e32 v3, 0x41000000, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -8125,18 +8125,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: two_non_inline_constant_multi_use:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41000000
; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -8153,18 +8153,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; GFX9-SDAG-LABEL: two_non_inline_constant_multi_use:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x41000000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x41000000
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 0.5, v1
; GFX9-SDAG-NEXT: v_add_f32_e32 v4, 0x41800000, v1
; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1
-; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s2, v2
-; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s0, v2
+; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-SDAG-NEXT: global_store_dword v[0:1], v4, off
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: global_store_dword v[0:1], v1, off
@@ -8173,18 +8173,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; GFX9-GISEL-LABEL: two_non_inline_constant_multi_use:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x41000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_add_f32_e32 v4, 0.5, v1
; GFX9-GISEL-NEXT: v_add_f32_e32 v5, 0x41800000, v1
; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1
; GFX9-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-GISEL-NEXT: global_store_dword v[0:1], v5, off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: global_store_dword v[0:1], v1, off
@@ -8193,18 +8193,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; GFX11-SDAG-LABEL: two_non_inline_constant_multi_use:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0x41000000
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s2, 0x41000000
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f32_e32 v3, 0x41800000, v1
; GFX11-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v1
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_med3_f32 v2, v2, s2, 0x41800000
-; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-SDAG-NEXT: v_med3_f32 v2, v2, s0, 0x41800000
+; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v1, off dlc
@@ -8215,17 +8215,17 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; GFX11-GISEL-LABEL: two_non_inline_constant_multi_use:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0x41800000 :: v_dual_add_f32 v3, 0.5, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v2, v3, 0x41000000, v2
; GFX11-GISEL-NEXT: v_add_f32_e32 v3, 0x41800000, v1
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v1, off dlc
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 7337d90..2d17955 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -37,92 +37,92 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_min3_f32 v0, v0, v1, v2
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_0_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2
-; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_0_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -169,92 +169,92 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_1_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_min3_f32 v0, v2, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_1_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f32 v0, v2, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_1_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -304,96 +304,96 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_min_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e32 v1, v2, v2
; VI-NEXT: v_min_f16_e32 v0, v0, v1
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f16 v0, v0, v1, v2
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -444,96 +444,96 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_min_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e32 v1, v2, v2
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f16 v0, v2, v0, v1
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -646,103 +646,103 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_0_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
-; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
+; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_0_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s6
-; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s12, s10
+; GFX11-NEXT: s_mov_b32 s13, s11
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -793,103 +793,103 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_1_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; VI-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_1_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
-; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
+; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_1_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s6
-; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s12, s10
+; GFX11-NEXT: s_mov_b32 s13, s11
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
index d20c39d..1620ecf 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
@@ -26,15 +26,15 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_uge_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -78,15 +78,15 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ugt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -130,15 +130,15 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ule_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ngt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -182,15 +182,15 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ult_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -234,15 +234,15 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_oge_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -286,15 +286,15 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ogt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -338,15 +338,15 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ole_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_le_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -390,15 +390,15 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_olt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index 45f6bff..0464b9a 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -263,12 +263,12 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_b32 v2, v0, s[2:3] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_minimum_f32 v1, v1, v2
; GCN-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -287,12 +287,12 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_u16 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_u16 v2, v0, s[2:3] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_minimum_f16 v1, v1, v2
; GCN-NEXT: global_store_b16 v0, v1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index 98faaac..384ea30 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -58,24 +58,24 @@ define amdgpu_kernel void @fmul_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -113,38 +113,38 @@ define amdgpu_kernel void @fmul_f16_imm_a(
;
; GFX89-LABEL: fmul_f16_imm_a:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: v_mul_f16_e32 v0, 0x4200, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_mul_f16_e32 v0, 0x4200, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -180,38 +180,38 @@ define amdgpu_kernel void @fmul_f16_imm_b(
;
; GFX89-LABEL: fmul_f16_imm_b:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: v_mul_f16_e32 v0, 4.0, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_mul_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,23 +309,23 @@ define amdgpu_kernel void @fmul_v2f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
-; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -369,60 +369,60 @@ define amdgpu_kernel void @fmul_v2f16_imm_a(
;
; VI-LABEL: fmul_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v1, 0x4400
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v0, 0x4200, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmul_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s0, 0x44004200
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x44004200
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v0, 0x44004200, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -464,60 +464,60 @@ define amdgpu_kernel void @fmul_v2f16_imm_b(
;
; VI-LABEL: fmul_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v1, 0x4200
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmul_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s0, 0x42004400
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x42004400
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v0, 0x42004400, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -631,24 +631,24 @@ define amdgpu_kernel void @fmul_v4f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v1, v3, v1
; GFX11-NEXT: v_pk_mul_f16 v0, v2, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -701,18 +701,18 @@ define amdgpu_kernel void @fmul_v4f16_imm_a(
;
; VI-LABEL: fmul_v4f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v2, 0x4400
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v1, 0x4200, v1
@@ -720,47 +720,47 @@ define amdgpu_kernel void @fmul_v4f16_imm_a(
; VI-NEXT: v_mul_f16_e32 v0, 0x4800, v0
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_or_b32_e32 v0, v0, v3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmul_v4f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s2, 0x44004200
-; GFX9-NEXT: s_mov_b32 s3, 0x40004800
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s6, 0x44004200
+; GFX9-NEXT: s_mov_b32 s7, 0x40004800
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v1, v1, s2
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, s3
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, s6
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, s7
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_v4f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v1, 0x44004200, v1
; GFX11-NEXT: v_pk_mul_f16 v0, 0x40004800, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index bde0dc3..25ec5b1 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -22,19 +22,19 @@ declare half @llvm.fabs.f16(half) #1
define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; VI-FLUSH-LABEL: fmuladd_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4
-; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5
-; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6
-; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s6
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s8
+; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s9
+; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s10
+; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s11
; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1]
; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3]
; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5]
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2
; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
@@ -42,19 +42,19 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-DENORM-LABEL: fmuladd_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
-; VI-DENORM-NEXT: v_mov_b32_e32 v2, s4
-; VI-DENORM-NEXT: v_mov_b32_e32 v3, s5
-; VI-DENORM-NEXT: v_mov_b32_e32 v4, s6
-; VI-DENORM-NEXT: v_mov_b32_e32 v5, s7
+; VI-DENORM-NEXT: v_mov_b32_e32 v0, s6
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s7
+; VI-DENORM-NEXT: v_mov_b32_e32 v2, s8
+; VI-DENORM-NEXT: v_mov_b32_e32 v3, s9
+; VI-DENORM-NEXT: v_mov_b32_e32 v4, s10
+; VI-DENORM-NEXT: v_mov_b32_e32 v5, s11
; VI-DENORM-NEXT: flat_load_ushort v6, v[0:1]
; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3]
; VI-DENORM-NEXT: flat_load_ushort v3, v[4:5]
-; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT: v_mov_b32_e32 v0, s4
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_fma_f16 v2, v6, v2, v3
; VI-DENORM-NEXT: flat_store_short v[0:1], v2
@@ -62,65 +62,65 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX10-FLUSH-LABEL: fmuladd_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_clause 0x2
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-NEXT: s_clause 0x2
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-NEXT: global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_clause 0x2
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-NEXT: s_clause 0x2
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[4:5]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -136,19 +136,19 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; VI-FLUSH-LABEL: fmul_fadd_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4
-; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5
-; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6
-; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s6
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s8
+; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s9
+; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s10
+; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s11
; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1]
; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3]
; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5]
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2
; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
@@ -156,19 +156,19 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s2
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v2, s4
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v3, s5
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v4, s6
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v5, s7
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s6
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s7
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v2, s8
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v3, s9
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v4, s10
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v5, s11
; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v6, v[0:1]
; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3]
; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5]
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s0
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s4
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s5
; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v6, v2, v3
; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
@@ -176,100 +176,100 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX10-FLUSH-LABEL: fmul_fadd_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_clause 0x2
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-STRICT-NEXT: s_clause 0x2
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1)
; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmul_fadd_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_clause 0x2
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-STRICT-NEXT: s_clause 0x2
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1)
; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: s_clause 0x2
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[4:5]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
@@ -286,19 +286,19 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; VI-FLUSH-LABEL: fmul_fadd_contract_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4
-; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5
-; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6
-; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s6
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s8
+; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s9
+; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s10
+; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s11
; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1]
; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3]
; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5]
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2
; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
@@ -306,19 +306,19 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
;
; VI-DENORM-LABEL: fmul_fadd_contract_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
-; VI-DENORM-NEXT: v_mov_b32_e32 v2, s4
-; VI-DENORM-NEXT: v_mov_b32_e32 v3, s5
-; VI-DENORM-NEXT: v_mov_b32_e32 v4, s6
-; VI-DENORM-NEXT: v_mov_b32_e32 v5, s7
+; VI-DENORM-NEXT: v_mov_b32_e32 v0, s6
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s7
+; VI-DENORM-NEXT: v_mov_b32_e32 v2, s8
+; VI-DENORM-NEXT: v_mov_b32_e32 v3, s9
+; VI-DENORM-NEXT: v_mov_b32_e32 v4, s10
+; VI-DENORM-NEXT: v_mov_b32_e32 v5, s11
; VI-DENORM-NEXT: flat_load_ushort v6, v[0:1]
; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3]
; VI-DENORM-NEXT: flat_load_ushort v3, v[4:5]
-; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT: v_mov_b32_e32 v0, s4
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_fma_f16 v2, v6, v2, v3
; VI-DENORM-NEXT: flat_store_short v[0:1], v2
@@ -326,65 +326,65 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
;
; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_clause 0x2
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmul_fadd_contract_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-NEXT: s_clause 0x2
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-NEXT: global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_clause 0x2
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmul_fadd_contract_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-NEXT: s_clause 0x2
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[4:5]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -401,11 +401,11 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -419,11 +419,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
;
; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -437,59 +437,59 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX10-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -509,11 +509,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -527,11 +527,11 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
;
; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -545,59 +545,59 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -617,11 +617,11 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
; VI-FLUSH-LABEL: fadd_a_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -635,11 +635,11 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
;
; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -653,90 +653,90 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
;
; GFX10-FLUSH-LABEL: fadd_a_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fadd_a_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
@@ -759,11 +759,11 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
; VI-FLUSH-LABEL: fadd_b_a_a_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -777,11 +777,11 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
;
; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -795,90 +795,90 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
;
; GFX10-FLUSH-LABEL: fadd_b_a_a_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fadd_b_a_a_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
@@ -901,11 +901,11 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -919,11 +919,11 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
;
; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -937,59 +937,59 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
;
; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -1009,11 +1009,11 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1027,11 +1027,11 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
;
; VI-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1045,59 +1045,59 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
;
; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -1119,11 +1119,11 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1137,11 +1137,11 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
;
; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1155,59 +1155,59 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
;
; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -1229,11 +1229,11 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1247,11 +1247,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
;
; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1265,59 +1265,59 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
;
; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -2358,11 +2358,11 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2376,11 +2376,11 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
;
; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2394,90 +2394,90 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
@@ -2499,11 +2499,11 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2517,11 +2517,11 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
;
; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2535,90 +2535,90 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index ce5bb66..997db91 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -41,24 +41,24 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
;
; VI-LABEL: fnearbyint_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rndne_f16_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rndne_f16_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fnearbyint_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rndne_f16_e32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_rndne_f16_e32 v1, s4
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -81,24 +81,24 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
;
; VI-LABEL: fnearbyint_f32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rndne_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rndne_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fnearbyint_f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rndne_f32_e32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_rndne_f32_e32 v1, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -168,14 +168,14 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
; VI-LABEL: fnearbyint_v4f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_rndne_f32_e32 v2, s6
; VI-NEXT: v_rndne_f32_e32 v1, s5
; VI-NEXT: v_rndne_f32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -183,14 +183,14 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v3, s7
; GFX11-NEXT: v_rndne_f32_e32 v2, s6
; GFX11-NEXT: v_rndne_f32_e32 v1, s5
; GFX11-NEXT: v_rndne_f32_e32 v0, s4
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -237,21 +237,21 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
;
; VI-LABEL: nearbyint_f64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_rndne_f64_e32 v[0:1], s[6:7]
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: nearbyint_f64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[6:7]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,12 +309,12 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; VI-LABEL: nearbyint_v2f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -322,12 +322,12 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -406,18 +406,18 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; VI-LABEL: nearbyint_v4f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11]
; VI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9]
; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v11, s3
-; VI-NEXT: v_mov_b32_e32 v9, s1
-; VI-NEXT: v_mov_b32_e32 v10, s2
-; VI-NEXT: v_mov_b32_e32 v8, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v11, s1
+; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: v_mov_b32_e32 v10, s0
+; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
@@ -426,7 +426,7 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11]
@@ -434,8 +434,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 277dc01..c19f7d1 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -3036,21 +3036,21 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
;
; VI-LABEL: s_fneg_select_infloop_regression_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s4, 0
-; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT: s_bitcmp1_b32 s6, 0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec
-; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; VI-NEXT: s_and_b64 s[6:7], s[0:1], exec
+; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[0:1]
; VI-NEXT: s_cselect_b32 s2, 0, s2
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%i = select i1 %arg1, double 0.0, double %arg
@@ -3096,17 +3096,17 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a
;
; VI-LABEL: s_fneg_select_infloop_regression_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: s_load_dword s4, s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s2, 16
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
+; VI-NEXT: s_bitcmp1_b32 s4, 16
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
%i = select i1 %arg1, half 0.0, half %arg
@@ -3236,19 +3236,19 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a
; VI-LABEL: s_fneg_select_infloop_regression_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s6, 0
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[2:3]
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[2:3]
-; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3]
-; VI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%i = select i1 %arg1, <2 x float> zeroinitializer, <2 x float> %arg
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index 2c9042e..e3d3fdd 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -21,11 +21,11 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x,
; VI-LABEL: fneg_fabs_fadd_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_f64 v[0:1], s[0:1], -|v[0:1]|
+; VI-NEXT: v_add_f64 v[0:1], s[2:3], -|v[0:1]|
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -52,13 +52,13 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: v_fneg_fabs_fadd_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f64 v[0:1], s[2:3], -|s[2:3]|
+; VI-NEXT: v_add_f64 v[0:1], s[0:1], -|s[0:1]|
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%x = load double, ptr addrspace(1) %xptr, align 8
@@ -89,11 +89,11 @@ define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x,
; VI-LABEL: fneg_fabs_fmul_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mul_f64 v[0:1], s[0:1], -|v[0:1]|
+; VI-NEXT: v_mul_f64 v[0:1], s[2:3], -|v[0:1]|
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -122,12 +122,12 @@ define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: fneg_fabs_free_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_or_b32 s0, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -155,12 +155,12 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: fneg_fabs_fn_free_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_or_b32 s0, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -188,13 +188,13 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl
; VI-LABEL: fneg_fabs_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_or_b32 s0, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%fabs = call double @llvm.fabs.f64(double %in)
@@ -223,16 +223,16 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> %
; VI-LABEL: fneg_fabs_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_or_b32 s2, s7, 0x80000000
-; VI-NEXT: s_or_b32 s3, s5, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_or_b32 s0, s7, 0x80000000
+; VI-NEXT: s_or_b32 s1, s5, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
@@ -268,27 +268,27 @@ define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> %
; VI-LABEL: fneg_fabs_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s7, 31
; VI-NEXT: s_bitset1_b32 s5, 31
-; VI-NEXT: s_or_b32 s2, s11, 0x80000000
-; VI-NEXT: s_or_b32 s3, s9, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_or_b32 s0, s11, 0x80000000
+; VI-NEXT: s_or_b32 s1, s9, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v2, s10
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 32033c5..2a1ca0f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -79,13 +79,13 @@ define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: fneg_fabsf_free_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%bc = bitcast i32 %in to float
@@ -141,13 +141,13 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) {
;
; VI-LABEL: fneg_fabsf_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
@@ -177,13 +177,13 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: v_fneg_fabsf_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -251,18 +251,18 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %
; VI-LABEL: fneg_fabsf_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_or_b32 s2, s7, 0x80000000
-; VI-NEXT: s_or_b32 s3, s6, 0x80000000
+; VI-NEXT: s_or_b32 s0, s7, 0x80000000
+; VI-NEXT: s_or_b32 s1, s6, 0x80000000
; VI-NEXT: s_bitset1_b32 s5, 31
; VI-NEXT: s_bitset1_b32 s4, 31
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 94fc929..66b5cad 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -19,26 +19,26 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
;
; VI-LABEL: s_fneg_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_xor_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -116,18 +116,18 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl
; VI-LABEL: s_fneg_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s7, 0x80000000
-; VI-NEXT: s_xor_b32 s3, s6, 0x80000000
+; VI-NEXT: s_xor_b32 s0, s7, 0x80000000
+; VI-NEXT: s_xor_b32 s1, s6, 0x80000000
; VI-NEXT: s_xor_b32 s5, s5, 0x80000000
; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -135,17 +135,17 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s7, 0x80000000
-; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s7, 0x80000000
+; GFX11-NEXT: s_xor_b32 s1, s6, 0x80000000
; GFX11-NEXT: s_xor_b32 s4, s4, 0x80000000
; GFX11-NEXT: s_xor_b32 s5, s5, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2
-; GFX11-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -168,24 +168,24 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: fsub0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_sub_f32_e64 v2, 0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_sub_f32_e64 v2, 0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fsub0_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_f32_e64 v1, 0, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_sub_f32_e64 v1, 0, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -210,26 +210,26 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: fneg_free_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_xor_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_free_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -253,24 +253,24 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
;
; VI-LABEL: fneg_fold_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e64 v2, -s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e64 v2, -s4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fold_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mul_f32_e64 v1, -s2, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_mul_f32_e64 v1, -s4, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -295,24 +295,24 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in
;
; VI-LABEL: bitpreserve_fneg_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e64 v2, s2, -4.0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e64 v2, s4, -4.0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: bitpreserve_fneg_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mul_f32_e64 v1, s2, -4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_mul_f32_e64 v1, s4, -4.0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -339,26 +339,26 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: s_fneg_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_xor_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -391,24 +391,24 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: s_fneg_i32_fp_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_sub_f32_e64 v2, 2.0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_sub_f32_e64 v2, 2.0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i32_fp_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -448,25 +448,25 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: s_fneg_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_xor_b32 s0, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_xor_b32 s0, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -500,21 +500,21 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: s_fneg_i64_fp_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_f64 v[0:1], -s[6:7], 2.0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i64_fp_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_add_f64 v[0:1], -s[6:7], 2.0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -563,24 +563,24 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
;
; VI-LABEL: s_fneg_i16_fp_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_sub_f16_e64 v2, 2.0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_sub_f16_e64 v2, 2.0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i16_fp_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s4
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -631,31 +631,31 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
;
; VI-LABEL: s_fneg_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: s_xor_b32 s2, s2, 0x8000
-; VI-NEXT: s_xor_b32 s3, s3, 0x8000
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_lshl_b32 s3, s3, 16
-; VI-NEXT: s_or_b32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: s_xor_b32 s1, s4, 0x8000
+; VI-NEXT: s_xor_b32 s0, s0, 0x8000
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_lshl_b32 s0, s0, 16
+; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
+; GFX11-NEXT: s_xor_b32 s0, s4, 0x80008000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -714,31 +714,31 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg)
;
; VI-LABEL: s_fneg_v2i16_fp_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x4000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: s_xor_b32 s3, s3, 0x8000
-; VI-NEXT: s_xor_b32 s2, s2, 0x8000
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_add_f16_e64 v1, s2, 2.0
+; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: s_xor_b32 s0, s0, 0x8000
+; VI-NEXT: s_xor_b32 s1, s4, 0x8000
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_f16_e64 v1, s1, 2.0
; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v1, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_v2i16_fp_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_pk_add_f16 v1, s4, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 7f87b41..157b748 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -37,10 +37,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -49,10 +49,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -76,10 +76,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -88,10 +88,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index ca2fa0f..afca450 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -22,28 +22,28 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f
;
; VI-LABEL: test_isinf_pattern:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x204
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isinf_pattern:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x204
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x204
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -70,28 +70,28 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
;
; VI-LABEL: test_not_isinf_pattern_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cmp_nlg_f32_e64 s[0:1], |s4|, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_not_isinf_pattern_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2|
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s0, 0x7f800000, |s4|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -115,20 +115,20 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture %
;
; VI-LABEL: test_not_isinf_pattern_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_not_isinf_pattern_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -155,28 +155,28 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
;
; VI-LABEL: test_isfinite_pattern_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -205,28 +205,28 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o
;
; VI-LABEL: test_isfinite_pattern_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -253,27 +253,27 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur
;
; VI-LABEL: test_isfinite_not_pattern_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_not_pattern_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2
+; GFX11-NEXT: v_cmp_o_f32_e64 s0, s4, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -306,31 +306,31 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur
; VI-LABEL: test_isfinite_not_pattern_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s4, s4
; VI-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
-; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_not_pattern_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2
-; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2
+; GFX11-NEXT: v_cmp_o_f32_e64 s0, s4, s4
+; GFX11-NEXT: v_cmp_neq_f32_e64 s1, 0x7f800000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s2, s3, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -417,31 +417,31 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
; VI-LABEL: test_isfinite_not_pattern_3:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_u_f32_e64 s[0:1], s4, s4
; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s4|, v0
-; VI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_not_pattern_3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2
-; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2|
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, s4, s4
+; GFX11-NEXT: v_cmp_neq_f32_e64 s1, 0x7f800000, |s4|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s2, s3, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -470,28 +470,28 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
;
; VI-LABEL: test_isfinite_pattern_4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -520,28 +520,28 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
;
; VI-LABEL: test_isfinite_pattern_4_commute_and:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_4_commute_and:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -592,17 +592,17 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
; GFX11-LABEL: test_not_isfinite_pattern_4_wrong_ord_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x50
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x50
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s1, s4, 0x1f8
+; GFX11-NEXT: v_cmp_o_f32_e64 s0, s4, s5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s2, s3, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -632,28 +632,28 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
;
; VI-LABEL: test_isinf_pattern_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x204
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isinf_pattern_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204
+; GFX11-NEXT: v_cmp_class_f16_e64 s0, s4, 0x204
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -684,28 +684,28 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
;
; VI-LABEL: test_isfinite_pattern_0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_0_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f16_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -738,28 +738,28 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur
;
; VI-LABEL: test_isfinite_pattern_4_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_4_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f16_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
index 2928647..0b49b73 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
@@ -52,22 +52,22 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc
; GFX1030-LABEL: raw_buffer_atomic_min_noret_f32:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; GFX1100-LABEL: raw_buffer_atomic_min_noret_f32:
; GFX1100: ; %bb.0: ; %main_body
; GFX1100-NEXT: s_clause 0x1
-; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
+; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen
; GFX1100-NEXT: s_nop 0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
@@ -75,11 +75,11 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc
; GFX12-LABEL: raw_buffer_atomic_min_noret_f32:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -118,22 +118,22 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc
; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f32:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x1
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_buffer_atomic_min_noret_f32:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_clause 0x1
-; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
+; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
@@ -318,13 +318,13 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre
; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b96 s[8:10], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s0, 4
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[4:7], s0 offen th:TH_ATOMIC_NT_RETURN
+; GFX12-NEXT: v_mov_b32_e32 v1, s10
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ds_store_b32 v1, v0
; GFX12-NEXT: s_endpgm
@@ -444,22 +444,22 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc
; GFX1030-LABEL: raw_buffer_atomic_max_noret_f32:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; GFX1100-LABEL: raw_buffer_atomic_max_noret_f32:
; GFX1100: ; %bb.0: ; %main_body
; GFX1100-NEXT: s_clause 0x1
-; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
+; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen
; GFX1100-NEXT: s_nop 0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
@@ -467,11 +467,11 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc
; GFX12-LABEL: raw_buffer_atomic_max_noret_f32:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -510,22 +510,22 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc
; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f32:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x1
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_buffer_atomic_max_noret_f32:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_clause 0x1
-; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
+; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
index f4745a5..c35da12 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
@@ -104,22 +104,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8)
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x1
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_clause 0x1
-; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
+; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
@@ -462,22 +462,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8)
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x1
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_clause 0x1
-; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
+; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index bf3dbec..2663bd0 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1022,22 +1022,22 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s6
+; GFX90A-NEXT: v_mov_b32_e32 v1, s7
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1047,22 +1047,22 @@ main_body:
define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
-; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s6
+; GFX90A-NEXT: v_mov_b32_e32 v1, s7
+; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1072,22 +1072,22 @@ main_body:
define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
-; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s6
+; GFX90A-NEXT: v_mov_b32_e32 v1, s7
+; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1097,35 +1097,35 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB39_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
; GFX90A-NEXT: .LBB39_3:
; GFX90A-NEXT: s_endpgm
@@ -1139,14 +1139,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB39_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: .LBB39_2:
@@ -1166,13 +1166,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB40_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: .LBB40_2:
@@ -1187,14 +1187,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB40_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB40_2:
@@ -1207,35 +1207,35 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB41_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB41_2
; GFX90A-NEXT: .LBB41_3:
; GFX90A-NEXT: s_endpgm
@@ -1249,14 +1249,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB41_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: .LBB41_2:
@@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB42_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: .LBB42_2:
@@ -1297,14 +1297,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB42_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB42_2:
@@ -1479,33 +1479,33 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB49_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB49_2
; GFX90A-NEXT: .LBB49_3:
; GFX90A-NEXT: s_endpgm
@@ -1519,14 +1519,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB49_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB49_2:
@@ -1564,10 +1564,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1581,11 +1581,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
@@ -1593,10 +1593,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1636,10 +1636,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1760,23 +1760,23 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s7
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-NEXT: v_mov_b32_e32 v3, s3
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v3, s7
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
@@ -1829,10 +1829,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1846,23 +1846,23 @@ main_body:
define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s7
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-NEXT: v_mov_b32_e32 v3, s3
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v3, s7
; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
@@ -1892,23 +1892,23 @@ main_body:
define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s7
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-NEXT: v_mov_b32_e32 v3, s3
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v3, s7
; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
index a058c11..f710456 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
@@ -54,14 +54,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x2
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; G_SI-LABEL: raw_buffer_atomic_min_noret_f64:
@@ -104,14 +104,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -291,14 +291,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x2
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; G_SI-LABEL: raw_buffer_atomic_max_noret_f64:
@@ -341,14 +341,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
index 046c92a..f308174 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
@@ -104,14 +104,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -341,14 +341,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index b4fee70..facb3e5 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -20,12 +20,12 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) {
; VI-LABEL: fp_to_sint_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_i32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_i32:
@@ -59,12 +59,12 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in)
; VI-LABEL: fp_to_sint_i32_fabs:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_i32_f32_e64 v0, |s2|
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_i32_fabs:
@@ -147,17 +147,17 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: fp_to_sint_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_i32_f32_e32 v3, s7
-; VI-NEXT: v_cvt_i32_f32_e32 v2, s6
-; VI-NEXT: v_cvt_i32_f32_e32 v1, s5
-; VI-NEXT: v_cvt_i32_f32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: v_cvt_i32_f32_e32 v3, s3
+; VI-NEXT: v_cvt_i32_f32_e32 v2, s2
+; VI-NEXT: v_cvt_i32_f32_e32 v1, s1
+; VI-NEXT: v_cvt_i32_f32_e32 v0, s0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_v4i32:
@@ -217,24 +217,24 @@ define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) {
; VI-LABEL: fp_to_sint_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s4, 0x2f800000
-; VI-NEXT: s_mov_b32 s5, 0xcf800000
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s0, 0x2f800000
+; VI-NEXT: s_mov_b32 s1, 0xcf800000
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s2
-; VI-NEXT: v_mul_f32_e64 v1, |v0|, s4
+; VI-NEXT: v_mul_f32_e64 v1, |v0|, s0
; VI-NEXT: v_floor_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v2, v1, s5, |v0|
+; VI-NEXT: v_fma_f32 v2, v1, s1, |v0|
; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_xor_b32_e32 v0, v2, v3
; VI-NEXT: v_xor_b32_e32 v1, v1, v3
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v3
; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_i64:
@@ -509,24 +509,24 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
;
; VI-LABEL: fp_to_sint_v4i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s8, 0x2f800000
-; VI-NEXT: s_mov_b32 s9, 0xcf800000
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s0, 0x2f800000
+; VI-NEXT: s_mov_b32 s1, 0xcf800000
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_trunc_f32_e32 v0, s5
-; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8
+; VI-NEXT: v_trunc_f32_e32 v0, s9
+; VI-NEXT: v_mul_f32_e64 v1, |v0|, s0
; VI-NEXT: v_floor_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v2, v1, s9, |v0|
+; VI-NEXT: v_fma_f32 v2, v1, s1, |v0|
; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
-; VI-NEXT: v_trunc_f32_e32 v4, s4
+; VI-NEXT: v_trunc_f32_e32 v4, s8
; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
-; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8
+; VI-NEXT: v_mul_f32_e64 v3, |v4|, s0
; VI-NEXT: v_floor_f32_e32 v3, v3
; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0
; VI-NEXT: v_cvt_u32_f32_e32 v5, v3
-; VI-NEXT: v_fma_f32 v3, v3, s9, |v4|
+; VI-NEXT: v_fma_f32 v3, v3, s1, |v4|
; VI-NEXT: v_xor_b32_e32 v2, v2, v0
; VI-NEXT: v_cvt_u32_f32_e32 v6, v3
; VI-NEXT: v_xor_b32_e32 v1, v1, v0
@@ -534,22 +534,22 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v0, vcc
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v4
; VI-NEXT: v_xor_b32_e32 v4, v5, v1
-; VI-NEXT: v_trunc_f32_e32 v5, s7
+; VI-NEXT: v_trunc_f32_e32 v5, s11
; VI-NEXT: v_xor_b32_e32 v0, v6, v1
-; VI-NEXT: v_mul_f32_e64 v6, |v5|, s8
+; VI-NEXT: v_mul_f32_e64 v6, |v5|, s0
; VI-NEXT: v_floor_f32_e32 v6, v6
; VI-NEXT: v_cvt_u32_f32_e32 v7, v6
-; VI-NEXT: v_fma_f32 v6, v6, s9, |v5|
+; VI-NEXT: v_fma_f32 v6, v6, s1, |v5|
; VI-NEXT: v_cvt_u32_f32_e32 v6, v6
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; VI-NEXT: v_ashrrev_i32_e32 v4, 31, v5
-; VI-NEXT: v_trunc_f32_e32 v8, s6
+; VI-NEXT: v_trunc_f32_e32 v8, s10
; VI-NEXT: v_xor_b32_e32 v5, v6, v4
-; VI-NEXT: v_mul_f32_e64 v6, |v8|, s8
+; VI-NEXT: v_mul_f32_e64 v6, |v8|, s0
; VI-NEXT: v_floor_f32_e32 v6, v6
; VI-NEXT: v_cvt_u32_f32_e32 v9, v6
-; VI-NEXT: v_fma_f32 v6, v6, s9, |v8|
+; VI-NEXT: v_fma_f32 v6, v6, s1, |v8|
; VI-NEXT: v_cvt_u32_f32_e32 v10, v6
; VI-NEXT: v_xor_b32_e32 v7, v7, v4
; VI-NEXT: v_sub_u32_e32 v6, vcc, v5, v4
@@ -558,10 +558,10 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
; VI-NEXT: v_xor_b32_e32 v4, v10, v5
; VI-NEXT: v_xor_b32_e32 v8, v9, v5
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_subb_u32_e32 v5, vcc, v8, v5, vcc
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_v4i64:
@@ -749,14 +749,14 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in
;
; VI-LABEL: fp_to_uint_f32_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, s4
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], -1.0, s2
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i1:
@@ -799,14 +799,14 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa
;
; VI-LABEL: fp_to_uint_fabs_f32_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, |s4|
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], -1.0, |s2|
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_fabs_f32_to_i1:
@@ -850,12 +850,12 @@ define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in)
; VI-LABEL: fp_to_sint_f32_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_i32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_f32_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index f8ede1c..364e8ca 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -20,12 +20,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float %
; VI-LABEL: fp_to_uint_f32_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i32:
@@ -107,17 +107,17 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr
;
; VI-LABEL: fp_to_uint_v4f32_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_u32_f32_e32 v3, s7
-; VI-NEXT: v_cvt_u32_f32_e32 v2, s6
-; VI-NEXT: v_cvt_u32_f32_e32 v1, s5
-; VI-NEXT: v_cvt_u32_f32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: v_cvt_u32_f32_e32 v3, s3
+; VI-NEXT: v_cvt_u32_f32_e32 v2, s2
+; VI-NEXT: v_cvt_u32_f32_e32 v1, s1
+; VI-NEXT: v_cvt_u32_f32_e32 v0, s0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v4f32_to_v4i32:
@@ -170,18 +170,18 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x
; VI-LABEL: fp_to_uint_f32_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xcf800000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s0, 0xcf800000
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s2
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_floor_f32_e32 v2, v1
-; VI-NEXT: v_fma_f32 v0, v2, s3, v0
+; VI-NEXT: v_fma_f32 v0, v2, s0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v1, v2
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i64:
@@ -412,38 +412,38 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x
;
; VI-LABEL: fp_to_uint_v4f32_to_v4i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s2, 0xcf800000
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s0, 0xcf800000
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_trunc_f32_e32 v0, s5
-; VI-NEXT: v_trunc_f32_e32 v4, s4
+; VI-NEXT: v_trunc_f32_e32 v0, s9
+; VI-NEXT: v_trunc_f32_e32 v4, s8
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4
; VI-NEXT: v_floor_f32_e32 v5, v1
; VI-NEXT: v_floor_f32_e32 v6, v2
-; VI-NEXT: v_fma_f32 v0, v5, s2, v0
+; VI-NEXT: v_fma_f32 v0, v5, s0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v2, v0
-; VI-NEXT: v_fma_f32 v0, v6, s2, v4
-; VI-NEXT: v_trunc_f32_e32 v4, s7
+; VI-NEXT: v_fma_f32 v0, v6, s0, v4
+; VI-NEXT: v_trunc_f32_e32 v4, s11
; VI-NEXT: v_cvt_u32_f32_e32 v3, v5
; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; VI-NEXT: v_trunc_f32_e32 v8, s6
+; VI-NEXT: v_trunc_f32_e32 v8, s10
; VI-NEXT: v_cvt_u32_f32_e32 v1, v6
; VI-NEXT: v_floor_f32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v8
; VI-NEXT: v_floor_f32_e32 v9, v5
-; VI-NEXT: v_fma_f32 v4, v6, s2, v4
+; VI-NEXT: v_fma_f32 v4, v6, s0, v4
; VI-NEXT: v_cvt_u32_f32_e32 v7, v6
; VI-NEXT: v_cvt_u32_f32_e32 v6, v4
-; VI-NEXT: v_fma_f32 v4, v9, s2, v8
+; VI-NEXT: v_fma_f32 v4, v9, s0, v8
; VI-NEXT: v_cvt_u32_f32_e32 v5, v9
; VI-NEXT: v_cvt_u32_f32_e32 v4, v4
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v4f32_to_v4i64:
@@ -631,14 +631,14 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in
;
; VI-LABEL: fp_to_uint_f32_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, s4
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], 1.0, s2
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i1:
@@ -681,14 +681,14 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa
;
; VI-LABEL: fp_to_uint_fabs_f32_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s4|
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], 1.0, |s2|
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_fabs_f32_to_i1:
@@ -732,12 +732,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %i
; VI-LABEL: fp_to_uint_f32_to_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 82c25c0..2c74b3d 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @fpext_f16_to_f32(
;
; GFX89-LABEL: fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @fpext_f16_to_f64(
;
; GFX89-LABEL: fpext_f16_to_f64:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fpext_f16_to_f64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -161,42 +161,42 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32(
;
; GFX89-LABEL: fpext_v2f16_to_v2f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_dword v1, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX89-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fpext_v2f16_to_v2f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -234,38 +234,38 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64(
;
; GFX89-LABEL: fpext_v2f16_to_v2f64:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX89-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; GFX89-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fpext_v2f16_to_v2f64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -274,7 +274,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64(
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -299,38 +299,27 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
-; VI-LABEL: s_fneg_fpext_f16_to_f32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: s_fneg_fpext_f16_to_f32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s2
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX9-NEXT: s_endpgm
+; GFX89-LABEL: s_fneg_fpext_f16_to_f32:
+; GFX89: ; %bb.0: ; %entry
+; GFX89-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX89-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -363,38 +352,38 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32(
;
; GFX89-LABEL: fneg_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -429,38 +418,38 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -495,38 +484,38 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
;
; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -568,45 +557,45 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX89-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -649,45 +638,45 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX89-NEXT: v_mul_f16_e64 v0, -v0, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -729,45 +718,45 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX89-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -810,45 +799,45 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX89-NEXT: v_mul_f16_e64 v0, |v0|, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -890,45 +879,45 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX89-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -972,45 +961,45 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX89-NEXT: v_mul_f16_e64 v0, -|v0|, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX11-NEXT: v_mul_f16_e64 v0, -|v0|, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1031,3 +1020,6 @@ entry:
declare half @llvm.fabs.f16(half) #1
attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9: {{.*}}
+; VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index 238010e..ca58708 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @fptosi_f16_to_i16(
;
; VI-LABEL: fptosi_f16_to_i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_i16_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_f16_to_i16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @fptosi_f16_to_i32(
;
; VI-LABEL: fptosi_f16_to_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_f16_to_i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -164,43 +164,43 @@ define amdgpu_kernel void @fptosi_f16_to_i64(
;
; VI-LABEL: fptosi_f16_to_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_f16_to_i64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -241,37 +241,37 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
;
; VI-LABEL: fptosi_v2f16_to_v2i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_i16_f16_e32 v1, v0
; VI-NEXT: v_cvt_i16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_v2f16_to_v2i16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0
@@ -280,7 +280,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -318,38 +318,38 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
;
; VI-LABEL: fptosi_v2f16_to_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_i32_f32_e32 v0, v1
; VI-NEXT: v_cvt_i32_f32_e32 v1, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_v2f16_to_v2i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -358,7 +358,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -401,17 +401,17 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
;
; VI-LABEL: fptosi_v2f16_to_v2i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -419,22 +419,22 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
; VI-NEXT: v_cvt_i32_f32_e32 v2, v2
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_v2f16_to_v2i64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -446,7 +446,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -475,28 +475,28 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
;
; VI-LABEL: fptosi_f16_to_i1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], -1.0, s4
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f16_e64 s[0:1], -1.0, s2
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_f16_to_i1:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s0, -1.0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index 1116dc9..2d5ae03 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @fptoui_f16_to_i16(
;
; VI-LABEL: fptoui_f16_to_i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_u16_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_f16_to_i16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @fptoui_f16_to_i32(
;
; VI-LABEL: fptoui_f16_to_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_f16_to_i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -164,43 +164,43 @@ define amdgpu_kernel void @fptoui_f16_to_i64(
;
; VI-LABEL: fptoui_f16_to_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_f16_to_i64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -240,37 +240,37 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
;
; VI-LABEL: fptoui_v2f16_to_v2i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_u16_f16_e32 v1, v0
; VI-NEXT: v_cvt_u16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_v2f16_to_v2i16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0
@@ -279,7 +279,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -317,38 +317,38 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
;
; VI-LABEL: fptoui_v2f16_to_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_u32_f32_e32 v0, v1
; VI-NEXT: v_cvt_u32_f32_e32 v1, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_v2f16_to_v2i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -357,7 +357,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -400,17 +400,17 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
;
; VI-LABEL: fptoui_v2f16_to_v2i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -418,22 +418,22 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_v2f16_to_v2i64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -444,7 +444,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -473,28 +473,28 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
;
; VI-LABEL: fptoui_f16_to_i1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], 1.0, s4
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f16_e64 s[0:1], 1.0, s2
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_f16_to_i1:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s0, 1.0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 6cc7368..3873036 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -41,94 +41,94 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
;
; VI-SDAG-LABEL: fptrunc_f32_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f32_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_f32_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_f32_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f32_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -176,102 +176,102 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
;
; VI-SDAG-LABEL: fptrunc_f64_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f64_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -323,109 +323,109 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
;
; VI-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; VI-GISEL-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; VI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1
; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -481,93 +481,93 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
;
; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-GISEL-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
+; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, v[2:3]
@@ -577,27 +577,27 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -643,94 +643,94 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
;
; VI-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s0
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -777,94 +777,94 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
;
; VI-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -911,94 +911,94 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
;
; VI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0|
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2|
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s0|
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0|
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2|
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s0|
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0|
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2|
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s0|
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1046,98 +1046,98 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
;
; VI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1185,98 +1185,98 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
;
; VI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1327,102 +1327,102 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
;
; VI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; VI-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX9-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index e4aa4d1..bcef7bc 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -28,66 +28,66 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in)
;
; VI-SDAG-LABEL: fptrunc_f64_to_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f64_to_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_f64_to_f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX10-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_f64_to_f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f64_to_f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f64_to_f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -218,358 +218,356 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
;
; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; VI-SAFE-GISEL: ; %bb.0:
-; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; VI-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; VI-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-SAFE-GISEL-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; VI-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 8
+; VI-SAFE-GISEL-NEXT: s_and_b32 s2, s7, 0x1ff
+; VI-SAFE-GISEL-NEXT: s_addk_i32 s0, 0xfc10
+; VI-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0xffe
+; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s6
; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; VI-SAFE-GISEL-NEXT: s_sub_i32 s7, 1, s4
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s4, 12
-; VI-SAFE-GISEL-NEXT: s_max_i32 s7, s7, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s2, s6
-; VI-SAFE-GISEL-NEXT: s_min_i32 s7, s7, 13
-; VI-SAFE-GISEL-NEXT: s_bitset1_b32 s2, 12
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s8, s2, s7
-; VI-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s7, s8, s7
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; VI-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s2
+; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s8, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s2, s6
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; VI-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; VI-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s0
+; VI-SAFE-GISEL-NEXT: s_lshl_b32 s3, s0, 12
+; VI-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
+; VI-SAFE-GISEL-NEXT: s_or_b32 s3, s1, s3
+; VI-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
+; VI-SAFE-GISEL-NEXT: s_bitset1_b32 s1, 12
+; VI-SAFE-GISEL-NEXT: s_lshl_b32 s2, s2, 9
+; VI-SAFE-GISEL-NEXT: s_lshr_b32 s8, s1, s6
+; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s2, 0x7c00
+; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s8, s6
+; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s1
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s1, 1, 0
+; VI-SAFE-GISEL-NEXT: s_or_b32 s1, s8, s1
+; VI-SAFE-GISEL-NEXT: s_cmp_lt_i32 s0, 1
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s1, s1, s3
+; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s1, 7
+; VI-SAFE-GISEL-NEXT: s_lshr_b32 s1, s1, 2
+; VI-SAFE-GISEL-NEXT: s_cmp_eq_u32 s3, 3
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
-; VI-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; VI-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; VI-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-SAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s3, 5
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; VI-SAFE-GISEL-NEXT: s_or_b32 s3, s6, s3
+; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 1
+; VI-SAFE-GISEL-NEXT: s_add_i32 s1, s1, s3
+; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s0, 30
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s1, 0x7c00, s1
+; VI-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s0, s2, s1
+; VI-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 16
+; VI-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0x8000
+; VI-SAFE-GISEL-NEXT: s_or_b32 s0, s1, s0
+; VI-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-SAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-SAFE-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; VI-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-SAFE-GISEL-NEXT: s_endpgm
;
; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; VI-UNSAFE-SDAG: ; %bb.0:
-; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s6, -1
; VI-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-UNSAFE-SDAG-NEXT: s_endpgm
;
; VI-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; VI-UNSAFE-GISEL: ; %bb.0:
-; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-UNSAFE-GISEL-NEXT: s_endpgm
;
; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX10-SAFE-SDAG: ; %bb.0:
-; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0
-; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
-; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
-; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
-; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
-; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12
-; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6
-; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6
-; GFX10-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s0, s7, 0x1ff
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s1, s7, 8
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s0, s0, s6
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s1, s1, 0xffe
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s2, 0x3f1, s0
+; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s0, 0xfc10
+; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s2, 0, 13
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s0, 12
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s1, s1, s2
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s1, 0x1000
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s1, s6
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s3, s2, s3
+; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s3
+; GFX10-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s2, v0
; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
-; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s0, 1
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s2, s6
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s3, s2, 7
+; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s3, 5
; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0
-; GFX10-SAFE-SDAG-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s3, 3
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s3, -1, 0
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s2, s2, 2
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s3, s3, s6
+; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s3, 0
+; GFX10-SAFE-SDAG-NEXT: s_addc_u32 s2, s2, 0
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s0, 31
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s2, 0x7c00
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s1, -1, 0
+; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s2, s3, 16
-; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s0, s7, 16
+; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s0, s0, 0x8000
; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0
-; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo
-; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
+; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-SAFE-SDAG-NEXT: s_endpgm
;
; GFX10-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX10-SAFE-GISEL: ; %bb.0:
-; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX10-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s2, s7, 0x1ff
+; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 8
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s6
+; GFX10-SAFE-GISEL-NEXT: s_addk_i32 s0, 0xfc10
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0xffe
; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
-; GFX10-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
-; GFX10-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; GFX10-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s2
+; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX10-SAFE-GISEL-NEXT: s_sub_i32 s3, 1, s0
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s8, s1, 0x1000
+; GFX10-SAFE-GISEL-NEXT: s_max_i32 s3, s3, 0
+; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s0, 12
+; GFX10-SAFE-GISEL-NEXT: s_min_i32 s3, s3, 13
+; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s2, s2, 9
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s3
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s6
+; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s3, s9, s3
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, 0x7c00
+; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s3, s8
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s3, s9, s3
+; GFX10-SAFE-GISEL-NEXT: s_cmp_lt_i32 s0, 1
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s1, s3, s1
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s1, 7
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s1, s1, 2
+; GFX10-SAFE-GISEL-NEXT: s_cmp_eq_u32 s3, 3
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
-; GFX10-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; GFX10-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s3, 5
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s3, s6, s3
+; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 1
+; GFX10-SAFE-GISEL-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s0, 30
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s1, 0x7c00, s1
+; GFX10-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s0, s2, s1
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 16
+; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0x8000
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s0, s1, s0
+; GFX10-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-SAFE-GISEL-NEXT: s_endpgm
;
; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX10-UNSAFE-SDAG: ; %bb.0:
-; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s6, -1
; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-UNSAFE-SDAG-NEXT: s_endpgm
;
; GFX10-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX10-UNSAFE-GISEL: ; %bb.0:
-; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-UNSAFE-GISEL-NEXT: s_endpgm
;
; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-SAFE-SDAG: ; %bb.0:
-; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s0, s7, 0x1ff
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s1, s7, 8
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s0, s0, s6
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s1, s1, 0xffe
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
-; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
-; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
-; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
-; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12
+; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s2, 0x3f1, s0
+; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s0, 0xfc10
+; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s2, 0, 13
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s0, 12
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s1, s1, s2
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s1, 0x1000
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s1, s6
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s3, s2, s3
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6
-; GFX11-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s3
+; GFX11-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s2, v0
; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s0, 1
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s2, s6
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
-; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s3, s2, 7
+; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s3, 5
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0
-; GFX11-SAFE-SDAG-NEXT: s_addc_u32 s5, s5, 0
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s3, 3
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s3, s3, s6
+; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-SAFE-SDAG-NEXT: s_addc_u32 s2, s2, 0
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s0, 31
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s2, 0x7c00
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s1, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s2, s3, 16
-; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s0, s7, 16
+; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s0, s0, 0x8000
; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0
-; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo
+; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SAFE-SDAG-NEXT: s_nop 0
; GFX11-SAFE-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SAFE-SDAG-NEXT: s_endpgm
;
; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX11-SAFE-GISEL: ; %bb.0:
-; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX11-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s2, s7, 0x1ff
+; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 8
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s6
+; GFX11-SAFE-GISEL-NEXT: s_addk_i32 s0, 0xfc10
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0xffe
; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
-; GFX11-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
-; GFX11-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s2
+; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX11-SAFE-GISEL-NEXT: s_sub_i32 s3, 1, s0
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s8, s1, 0x1000
+; GFX11-SAFE-GISEL-NEXT: s_max_i32 s3, s3, 0
+; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s0, 12
+; GFX11-SAFE-GISEL-NEXT: s_min_i32 s3, s3, 13
+; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s2, s2, 9
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s3
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s6
+; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s3, s9, s3
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, 0x7c00
+; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s3, s8
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; GFX11-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s3, s9, s3
+; GFX11-SAFE-GISEL-NEXT: s_cmp_lt_i32 s0, 1
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s1, s3, s1
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s1, 7
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s1, s1, 2
+; GFX11-SAFE-GISEL-NEXT: s_cmp_eq_u32 s3, 3
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s3, 5
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s3, s6, s3
+; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 1
+; GFX11-SAFE-GISEL-NEXT: s_add_i32 s1, s1, s3
+; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s0, 30
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s1, 0x7c00, s1
+; GFX11-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s0, s2, s1
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 16
+; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0x8000
; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; GFX11-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-SAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s0, s1, s0
+; GFX11-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-SAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SAFE-GISEL-NEXT: s_nop 0
; GFX11-SAFE-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SAFE-GISEL-NEXT: s_endpgm
;
; GFX11-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-UNSAFE-SDAG: ; %bb.0:
-; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s6, -1
; GFX11-UNSAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-UNSAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-UNSAFE-SDAG-NEXT: s_nop 0
; GFX11-UNSAFE-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-UNSAFE-SDAG-NEXT: s_endpgm
;
; GFX11-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX11-UNSAFE-GISEL: ; %bb.0:
-; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-UNSAFE-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-UNSAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-UNSAFE-GISEL-NEXT: s_nop 0
; GFX11-UNSAFE-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-UNSAFE-GISEL-NEXT: s_endpgm
@@ -595,79 +593,79 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do
; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s11, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s10, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f32:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s10, -1
+; VI-GISEL-NEXT: s_mov_b32 s11, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
-; VI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_v2f64_to_v2f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX10-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_v2f64_to_v2f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
-; GFX10-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX11-SDAG-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
-; GFX11-GISEL-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -696,87 +694,89 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s11, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s10, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s2, -1
-; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v3f64_to_v3f32:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s14, -1
+; VI-GISEL-NEXT: s_mov_b32 s15, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
-; VI-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[12:15], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_v3f64_to_v3f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x2
+; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_v3f64_to_v3f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
-; GFX10-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v3f64_to_v3f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x2
+; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x44
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
-; GFX11-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -803,91 +803,91 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do
; VI-SDAG-LABEL: fptrunc_v4f64_to_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s15, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s14, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v4f64_to_v4f32:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s14, -1
+; VI-GISEL-NEXT: s_mov_b32 s15, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
-; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_v4f64_to_v4f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_v4f64_to_v4f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
-; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v4f64_to_v4f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
-; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -919,9 +919,9 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; VI-SDAG-LABEL: fptrunc_v8f64_to_v8f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s23, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s22, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
@@ -931,16 +931,16 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v8f64_to_v8f32:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s22, -1
+; VI-GISEL-NEXT: s_mov_b32 s23, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
@@ -950,17 +950,13 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[14:15]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
-; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; VI-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_v8f64_to_v8f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
@@ -970,17 +966,17 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_v8f64_to_v8f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
@@ -990,17 +986,17 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[14:15]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
-; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v8f64_to_v8f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
@@ -1010,20 +1006,20 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
-; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-SDAG-NEXT: buffer_store_b128 v[4:7], off, s[4:7], 0 offset:16
+; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
@@ -1033,9 +1029,13 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[14:15]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-GISEL-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-GISEL-NEXT: buffer_store_b128 v[4:7], off, s[4:7], 0 offset:16
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 0d59021..c7e284d 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -93,12 +93,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 8
+; VI-NEXT: s_add_u32 s0, s2, 8
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -160,12 +160,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -185,12 +185,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
@@ -277,12 +277,12 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 8
+; VI-NEXT: s_add_u32 s0, s2, 8
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -335,12 +335,12 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v3, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -357,12 +357,12 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -446,12 +446,12 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 8
+; VI-NEXT: s_add_u32 s0, s2, 8
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -504,12 +504,12 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v3, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -526,12 +526,12 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -629,12 +629,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -720,12 +720,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
@@ -757,12 +757,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
@@ -853,12 +853,12 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -911,12 +911,12 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -933,12 +933,12 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f32_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1014,12 +1014,12 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -1072,12 +1072,12 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1094,12 +1094,12 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f32_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1218,12 +1218,12 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -1303,12 +1303,12 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v12, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v12, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v12, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v12, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -1338,12 +1338,12 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v12, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v12, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[0:1]
+; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[2:3]
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
@@ -1462,12 +1462,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -1536,12 +1536,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v10, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1567,12 +1567,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v10, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[0:1]
+; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1687,12 +1687,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -1761,12 +1761,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v10, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1792,12 +1792,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v10, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[0:1]
+; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1956,12 +1956,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -2053,12 +2053,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -2091,12 +2091,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(1)
; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX1150-NEXT: s_waitcnt vmcnt(0)
@@ -2346,11 +2346,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s2, 32
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
@@ -2493,12 +2493,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[2:3] offset:32
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -2553,12 +2553,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v4, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[2:3] offset:32
; GFX1150-NEXT: s_waitcnt vmcnt(1)
; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX1150-NEXT: s_waitcnt vmcnt(0)
@@ -2728,11 +2728,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s2, 32
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v4, s0
@@ -2864,12 +2864,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[2:3] offset:32
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v6, null, v3, v3, v1
; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
@@ -2922,12 +2922,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v4, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[2:3] offset:32
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f32 v6, null, v3, v3, v1
; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
@@ -3152,11 +3152,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_add_u32 s0, s0, 64
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s2, 64
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v4, s0
@@ -3378,12 +3378,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7]
-; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64
+; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:64
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v10, null, v7, v7, v3
; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
@@ -3478,12 +3478,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v8, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b128 v[0:3], v8, s[6:7]
-; GFX1150-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64
+; GFX1150-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:64
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f32 v10, null, v7, v7, v3
; GFX1150-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
@@ -3731,11 +3731,11 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_add_u32 s0, s0, 64
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s2, 64
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v4, s0
@@ -3859,12 +3859,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v16, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v16, s[6:7]
-; GFX11-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64
+; GFX11-NEXT: global_load_b128 v[4:7], v16, s[2:3] offset:64
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -3913,12 +3913,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v16, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b128 v[0:3], v16, s[6:7]
-; GFX1150-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64
+; GFX1150-NEXT: global_load_b128 v[4:7], v16, s[2:3] offset:64
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index fecf303..2e36b53 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -210,22 +210,22 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: s_not_b32 s3, s3
-; VI-NEXT: s_lshr_b32 s7, s5, 1
+; VI-NEXT: s_not_b32 s1, s3
+; VI-NEXT: s_lshr_b32 s0, s5, 1
; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_not_b32 s2, s2
+; VI-NEXT: s_not_b32 s1, s2
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; VI-NEXT: s_lshr_b32 s3, s4, 1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_alignbit_b32 v0, s3, v0, v2
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_lshr_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -293,18 +293,18 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v0, s5, s7, 1
; GFX11-NEXT: v_alignbit_b32 v3, s4, s6, 1
-; GFX11-NEXT: s_lshr_b32 s5, s5, 1
-; GFX11-NEXT: s_not_b32 s3, s3
-; GFX11-NEXT: s_lshr_b32 s4, s4, 1
+; GFX11-NEXT: s_lshr_b32 s0, s5, 1
+; GFX11-NEXT: s_not_b32 s1, s3
+; GFX11-NEXT: s_lshr_b32 s3, s4, 1
; GFX11-NEXT: s_not_b32 s2, s2
-; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_alignbit_b32 v1, s0, v0, s1
+; GFX11-NEXT: v_alignbit_b32 v0, s3, v3, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[8:9]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -332,14 +332,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; VI-LABEL: fshl_v2i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_alignbit_b32 v1, s5, v0, 23
; VI-NEXT: v_alignbit_b32 v0, s4, v2, 25
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -386,12 +386,12 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 23
; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 25
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -441,34 +441,34 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
-; VI-NEXT: s_not_b32 s3, s15
-; VI-NEXT: s_lshr_b32 s2, s7, 1
+; VI-NEXT: s_not_b32 s1, s15
+; VI-NEXT: s_lshr_b32 s0, s7, 1
; VI-NEXT: v_alignbit_b32 v0, s7, v0, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v3, s2, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v3, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s10
-; VI-NEXT: s_not_b32 s3, s14
+; VI-NEXT: s_not_b32 s1, s14
; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1
-; VI-NEXT: s_lshr_b32 s2, s6, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1
+; VI-NEXT: s_lshr_b32 s0, s6, 1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: s_not_b32 s3, s13
+; VI-NEXT: s_not_b32 s1, s13
; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1
-; VI-NEXT: s_lshr_b32 s2, s5, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1
+; VI-NEXT: s_lshr_b32 s0, s5, 1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_not_b32 s3, s12
+; VI-NEXT: s_not_b32 s1, s12
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; VI-NEXT: s_lshr_b32 s2, s4, 1
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_alignbit_b32 v0, s2, v0, v4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_lshr_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_alignbit_b32 v0, s0, v0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -534,29 +534,29 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
;
; GFX10-LABEL: fshl_v4i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1
; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1
; GFX10-NEXT: v_alignbit_b32 v5, s5, s9, 1
; GFX10-NEXT: v_alignbit_b32 v6, s4, s8, 1
-; GFX10-NEXT: s_lshr_b32 s2, s7, 1
-; GFX10-NEXT: s_not_b32 s3, s15
+; GFX10-NEXT: s_lshr_b32 s0, s7, 1
+; GFX10-NEXT: s_not_b32 s1, s15
; GFX10-NEXT: s_lshr_b32 s6, s6, 1
; GFX10-NEXT: s_not_b32 s7, s14
; GFX10-NEXT: s_lshr_b32 s5, s5, 1
; GFX10-NEXT: s_not_b32 s9, s13
; GFX10-NEXT: s_lshr_b32 s4, s4, 1
; GFX10-NEXT: s_not_b32 s8, s12
-; GFX10-NEXT: v_alignbit_b32 v3, s2, v0, s3
+; GFX10-NEXT: v_alignbit_b32 v3, s0, v0, s1
; GFX10-NEXT: v_alignbit_b32 v2, s6, v1, s7
; GFX10-NEXT: v_alignbit_b32 v1, s5, v5, s9
; GFX10-NEXT: v_alignbit_b32 v0, s4, v6, s8
-; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_v4i32:
@@ -564,26 +564,26 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v0, s7, s11, 1
; GFX11-NEXT: v_alignbit_b32 v1, s6, s10, 1
; GFX11-NEXT: v_alignbit_b32 v5, s5, s9, 1
; GFX11-NEXT: v_alignbit_b32 v6, s4, s8, 1
-; GFX11-NEXT: s_lshr_b32 s2, s7, 1
-; GFX11-NEXT: s_not_b32 s3, s15
+; GFX11-NEXT: s_lshr_b32 s0, s7, 1
+; GFX11-NEXT: s_not_b32 s1, s15
; GFX11-NEXT: s_lshr_b32 s6, s6, 1
; GFX11-NEXT: s_not_b32 s7, s14
; GFX11-NEXT: s_lshr_b32 s5, s5, 1
; GFX11-NEXT: s_not_b32 s9, s13
; GFX11-NEXT: s_lshr_b32 s4, s4, 1
; GFX11-NEXT: s_not_b32 s8, s12
-; GFX11-NEXT: v_alignbit_b32 v3, s2, v0, s3
+; GFX11-NEXT: v_alignbit_b32 v3, s0, v0, s1
; GFX11-NEXT: v_alignbit_b32 v2, s6, v1, s7
; GFX11-NEXT: v_alignbit_b32 v1, s5, v5, s9
; GFX11-NEXT: v_alignbit_b32 v0, s4, v6, s8
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -615,7 +615,7 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-LABEL: fshl_v4i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s10
@@ -624,17 +624,17 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-NEXT: v_alignbit_b32 v2, s6, v1, 23
; VI-NEXT: v_alignbit_b32 v1, s5, v4, 25
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 31
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v4i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s10
@@ -644,7 +644,7 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32_imm:
@@ -683,14 +683,14 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 31
; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 23
; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 25
; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 31
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index a5ea1ee..860fe74 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -196,7 +196,7 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -204,8 +204,8 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_alignbit_b32 v0, s4, v2, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -259,14 +259,14 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0
; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[8:9]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -294,14 +294,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; VI-LABEL: fshr_v2i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9
; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -348,12 +348,12 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9
; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 7
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -391,7 +391,7 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s15
@@ -405,8 +405,8 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -414,8 +414,8 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s15
@@ -429,7 +429,7 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v4i32:
@@ -474,7 +474,7 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14
@@ -485,7 +485,7 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, v4
; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, v5
-; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v6, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -517,7 +517,7 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-LABEL: fshr_v4i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s10
@@ -526,17 +526,17 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-NEXT: v_alignbit_b32 v2, s6, v1, 9
; VI-NEXT: v_alignbit_b32 v1, s5, v4, 7
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshr_v4i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s10
@@ -546,7 +546,7 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v4i32_imm:
@@ -583,14 +583,14 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1
; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 9
; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 7
; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 1
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index f72d4e0..6de84a6 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -58,24 +58,24 @@ define amdgpu_kernel void @fsub_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -113,38 +113,38 @@ define amdgpu_kernel void @fsub_f16_imm_a(
;
; GFX89-LABEL: fsub_f16_imm_a:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: v_sub_f16_e32 v0, 1.0, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_sub_f16_e32 v0, 1.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -180,38 +180,38 @@ define amdgpu_kernel void @fsub_f16_imm_b(
;
; GFX89-LABEL: fsub_f16_imm_b:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: v_add_f16_e32 v0, -2.0, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_add_f16_e32 v0, -2.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,23 +309,23 @@ define amdgpu_kernel void @fsub_v2f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
-; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -369,60 +369,60 @@ define amdgpu_kernel void @fsub_v2f16_imm_a(
;
; VI-LABEL: fsub_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v1, 0x4000
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_sub_f16_e32 v0, 1.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fsub_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s0, 0x40003c00
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x40003c00
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_f16 v0, v0, s0 neg_lo:[1,0] neg_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v0 neg_lo:[0,1] neg_hi:[0,1]
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -464,60 +464,60 @@ define amdgpu_kernel void @fsub_v2f16_imm_b(
;
; VI-LABEL: fsub_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v1, 0xbc00
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, -2.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fsub_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s0, 0xbc00c000
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0xbc00c000
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_f16 v0, v0, s0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, 0xbc00c000, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
index 1853aa9..6d868e84 100644
--- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
+++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
@@ -4,14 +4,14 @@
define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b32:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or3_b32 v0, v1, v0, v2
; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -31,17 +31,17 @@ bb:
define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b64:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
-; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
+; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] offset:16
+; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or3_b32 v1, v3, v1, v5
; GCN-NEXT: v_or3_b32 v0, v2, v0, v4
; GCN-NEXT: v_not_b32_e32 v1, v1
; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -61,15 +61,15 @@ bb:
define amdgpu_kernel void @divergent_and3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b32:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, v1, v0
; GCN-NEXT: v_and_b32_e32 v0, v0, v2
; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -89,11 +89,11 @@ bb:
define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b64:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
-; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
+; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[2:3]
+; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] offset:16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_and_b32_e32 v1, v3, v1
; GCN-NEXT: v_and_b32_e32 v0, v2, v0
@@ -102,7 +102,7 @@ define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) {
; GCN-NEXT: v_and_b32_e32 v0, v0, v4
; GCN-NEXT: v_not_b32_e32 v1, v1
; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -122,14 +122,14 @@ bb:
define amdgpu_kernel void @divergent_xor3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b32:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v0, v1, v0
; GCN-NEXT: v_xnor_b32_e32 v0, v0, v2
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -149,18 +149,18 @@ bb:
define amdgpu_kernel void @divergent_xor3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b64:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
-; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
+; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[2:3]
+; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] offset:16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_xor_b32_e32 v1, v3, v1
; GCN-NEXT: v_xor_b32_e32 v0, v2, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_xnor_b32_e32 v1, v1, v5
; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4
-; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
index 0612383..98bb405 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx803 -d - | FileCheck -check-prefix=DISASSEMBLY-VI %s
@@ -6,7 +7,7 @@
; FIXME: This will still fail for gfx6/7 and gfx10 subtargets.
; DISASSEMBLY-VI: .long 0xdd348000 // {{[0-9A-Z]+}}: DD348000
-; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc // {{[0-9A-Z]+}}: 00000100
+; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v2, v0, v0, vcc // {{[0-9A-Z]+}}: 00040100
define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #0 {
; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
@@ -18,13 +19,13 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB0_2
; GCN-NEXT: ; %bb.1:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
+; GCN-NEXT: global_atomic_add_f32 v0, v1, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: .LBB0_2:
diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
index b8ecbae..d3dc660 100644
--- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
@@ -135,10 +135,10 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT: v_mov_b32_e32 v2, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v0, s0
-; GFX908-NEXT: v_mov_b32_e32 v1, s1
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: v_mov_b32_e32 v1, s5
; GFX908-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX908-NEXT: s_endpgm
;
@@ -147,9 +147,9 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX90A-NEXT: s_endpgm
;
@@ -158,10 +158,10 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s0
-; GFX1030-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030-NEXT: s_endpgm
%gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
index f709eae..41327f9 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
@@ -13,15 +13,15 @@
define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: test_move_load_address_to_vgpr:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dword v0, v1, s[0:1] glc
+; GCN-NEXT: global_load_dword v0, v1, s[2:3] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_add_u32_e32 v2, 0xffffff00, v0
; GCN-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GCN-NEXT: .LBB0_1: ; %bb3
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -54,21 +54,21 @@ bb3: ; preds = %bb3, %bb
define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: test_move_load_address_to_vgpr_d16_hi:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_movk_i32 s0, 0x100
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc
+; GCN-NEXT: global_load_ushort v0, v1, s[2:3] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: s_movk_i32 s1, 0x100
+; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: .LBB1_1: ; %bb3
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_lshlrev_b64 v[3:4], 1, v[0:1]
-; GCN-NEXT: v_add_co_u32_e32 v3, vcc, s0, v3
+; GCN-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
; GCN-NEXT: global_load_short_d16_hi v0, v[3:4], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s1, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; GCN-NEXT: s_cbranch_vccz .LBB1_1
; GCN-NEXT: ; %bb.2: ; %bb2
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 4d585cf..7653cd0 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -19,13 +19,13 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_add_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -112,14 +112,14 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_add_i32_soffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s5, 0x8ca0
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s0, 0x8ca0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_add v0, off, s[0:3], s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_add v0, off, s[4:7], s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -394,13 +394,13 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_add_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -616,13 +616,13 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_and_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -846,13 +846,13 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_and_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -1068,13 +1068,13 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_sub_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -1298,13 +1298,13 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_sub_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -1520,13 +1520,13 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_max_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -1736,13 +1736,13 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_max_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i32:
@@ -1940,13 +1940,13 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_umax_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i32_offset:
@@ -2152,13 +2152,13 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_umax_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i32:
@@ -2356,13 +2356,13 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_min_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i32_offset:
@@ -2568,13 +2568,13 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_min_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i32:
@@ -2772,13 +2772,13 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_umin_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i32_offset:
@@ -2984,13 +2984,13 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_umin_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i32:
@@ -3190,13 +3190,13 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_or_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -3420,13 +3420,13 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_or_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -3642,13 +3642,13 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_xchg_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -3686,13 +3686,13 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %
;
; VI-LABEL: atomic_xchg_f32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -3916,13 +3916,13 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_xchg_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -4632,13 +4632,13 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_xor_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -4862,13 +4862,13 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_xor_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -5087,31 +5087,31 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 16
+; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %in, i64 4
@@ -5141,31 +5141,31 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a
;
; VI-LABEL: atomic_load_i32_negoffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32_negoffset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %in, i64 -128
@@ -5193,31 +5193,31 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_f32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 16
+; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f32_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr float, ptr addrspace(1) %in, i64 4
@@ -5245,29 +5245,29 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1
;
; VI-LABEL: atomic_load_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
@@ -5298,14 +5298,12 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p
;
; VI-LABEL: atomic_load_i32_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -5313,9 +5311,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32_addr64_offset:
@@ -5363,22 +5363,22 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i32_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32_addr64:
@@ -5425,14 +5425,12 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p
;
; VI-LABEL: atomic_load_f32_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -5440,9 +5438,11 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f32_addr64_offset:
@@ -5796,29 +5796,29 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs
;
; VI-LABEL: atomic_load_i8_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i8_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(1) %in, i64 16
@@ -5848,31 +5848,31 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad
;
; VI-LABEL: atomic_load_i8_negoffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ubyte v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i8_negoffset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(1) %in, i64 -512
@@ -5977,29 +5977,29 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i16_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i16_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr addrspace(1) %in, i64 8
@@ -6029,31 +6029,31 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a
;
; VI-LABEL: atomic_load_i16_negoffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i16_negoffset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr addrspace(1) %in, i64 -256
@@ -6307,13 +6307,13 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_inc_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -6400,14 +6400,14 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_inc_i32_soffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s5, 0x8ca0
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s0, 0x8ca0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -6681,13 +6681,13 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_dec_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -6774,14 +6774,14 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_dec_i32_soffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s5, 0x8ca0
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s0, 0x8ca0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -7058,29 +7058,29 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_f16_offset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f16_offset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%gep = getelementptr half, ptr addrspace(1) %in, i64 8
%val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2
@@ -7109,31 +7109,31 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a
;
; VI-LABEL: atomic_load_f16_negoffset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f16_negoffset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%gep = getelementptr half, ptr addrspace(1) %in, i64 -256
%val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2
@@ -7160,29 +7160,29 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add
;
; VI-LABEL: atomic_load_bf16_offset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_bf16_offset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr addrspace(1) %in, i64 8
%val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2
@@ -7211,31 +7211,31 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr
;
; VI-LABEL: atomic_load_bf16_negoffset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_bf16_negoffset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr addrspace(1) %in, i64 -256
%val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 3050da03..b8031c6 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -4753,26 +4753,26 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_max_i32_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x10
-; VI-NEXT: s_add_u32 s6, s6, 16
-; VI-NEXT: s_addc_u32 s7, s7, 0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB92_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_max_i32_e32 v2, s4, v3
+; VI-NEXT: v_max_i32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -4782,8 +4782,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_cbranch_execnz .LBB92_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -4963,24 +4963,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_max_i32_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: .LBB94_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_max_i32_e32 v2, s4, v3
+; VI-NEXT: v_max_i32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -4990,8 +4990,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB94_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -6006,26 +6006,26 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_umax_i32_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x10
-; VI-NEXT: s_add_u32 s6, s6, 16
-; VI-NEXT: s_addc_u32 s7, s7, 0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB106_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_max_u32_e32 v2, s4, v3
+; VI-NEXT: v_max_u32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6035,8 +6035,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
; VI-NEXT: s_cbranch_execnz .LBB106_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -6121,24 +6121,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_umax_i32_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: .LBB107_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_max_u32_e32 v2, s4, v3
+; VI-NEXT: v_max_u32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6148,8 +6148,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB107_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -7997,26 +7997,26 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_min_i32_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x10
-; VI-NEXT: s_add_u32 s6, s6, 16
-; VI-NEXT: s_addc_u32 s7, s7, 0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB129_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_min_i32_e32 v2, s4, v3
+; VI-NEXT: v_min_i32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -8026,8 +8026,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_cbranch_execnz .LBB129_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -8194,24 +8194,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_min_i32_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: .LBB131_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_min_i32_e32 v2, s4, v3
+; VI-NEXT: v_min_i32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -8221,8 +8221,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB131_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index f5dbaaf..a6c8f66 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -20,36 +20,36 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_add_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_add_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -115,15 +115,15 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_add_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -155,10 +155,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_add_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -191,11 +191,11 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
@@ -233,56 +233,56 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_add_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -312,38 +312,38 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_add_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_add_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -408,15 +408,15 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_add_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -447,10 +447,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_add_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -481,11 +481,11 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1]
@@ -522,54 +522,54 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_add_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_add_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -596,36 +596,36 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_and_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_and_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -691,15 +691,15 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_and_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -731,10 +731,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_and_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -767,11 +767,11 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32
@@ -809,56 +809,56 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_and_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -888,38 +888,38 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_and_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_and_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -984,15 +984,15 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_and_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1023,10 +1023,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_and_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -1057,11 +1057,11 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1]
@@ -1098,54 +1098,54 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_and_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_and_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1172,36 +1172,36 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_sub_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_sub_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -1267,15 +1267,15 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_sub_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1307,10 +1307,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_sub_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -1343,11 +1343,11 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32
@@ -1385,56 +1385,56 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_sub_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1464,38 +1464,38 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_sub_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_sub_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -1560,15 +1560,15 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_sub_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1599,10 +1599,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_sub_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -1633,11 +1633,11 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1]
@@ -1674,54 +1674,54 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_sub_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_sub_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1746,32 +1746,32 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_max_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -1834,15 +1834,15 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_max_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1872,10 +1872,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_max_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -1904,11 +1904,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32
@@ -1945,54 +1945,54 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_max_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2020,34 +2020,34 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_max_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -2109,15 +2109,15 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_max_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2146,10 +2146,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_max_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -2176,11 +2176,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1]
@@ -2216,52 +2216,52 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_max_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2286,32 +2286,32 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in
;
; VI-LABEL: atomic_umax_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -2374,15 +2374,15 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_umax_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2412,10 +2412,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; VI-LABEL: atomic_umax_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -2444,11 +2444,11 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32
@@ -2485,54 +2485,54 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2560,34 +2560,34 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_umax_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -2649,15 +2649,15 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-LABEL: atomic_umax_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2686,10 +2686,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in
; VI-LABEL: atomic_umax_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -2716,11 +2716,11 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1]
@@ -2756,52 +2756,52 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_umax_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2826,32 +2826,32 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_min_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -2914,15 +2914,15 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_min_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2952,10 +2952,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_min_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -2984,11 +2984,11 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32
@@ -3025,54 +3025,54 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_min_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3100,34 +3100,34 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_min_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -3189,15 +3189,15 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_min_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3226,10 +3226,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_min_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -3256,11 +3256,11 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1]
@@ -3296,52 +3296,52 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_min_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3366,32 +3366,32 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in
;
; VI-LABEL: atomic_umin_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -3454,15 +3454,15 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_umin_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3492,10 +3492,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out,
; VI-LABEL: atomic_umin_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -3524,11 +3524,11 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32
@@ -3565,54 +3565,54 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_umin_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3640,34 +3640,34 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_umin_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -3729,15 +3729,15 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-LABEL: atomic_umin_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3766,10 +3766,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in
; VI-LABEL: atomic_umin_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -3796,11 +3796,11 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1]
@@ -3836,52 +3836,52 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_umin_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3908,36 +3908,36 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_or_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_or_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4003,15 +4003,15 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a
; GFX12-LABEL: atomic_or_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4043,10 +4043,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
; VI-LABEL: atomic_or_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -4079,11 +4079,11 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32
@@ -4121,56 +4121,56 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
;
; VI-LABEL: atomic_or_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4200,38 +4200,38 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_or_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_or_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4296,15 +4296,15 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac
; GFX12-LABEL: atomic_or_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4335,10 +4335,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_or_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -4369,11 +4369,11 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1]
@@ -4410,54 +4410,54 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: atomic_or_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_or_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4484,36 +4484,36 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in
;
; VI-LABEL: atomic_xchg_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4539,36 +4539,36 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double
;
; VI-LABEL: atomic_xchg_f64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_f64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_f64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4594,36 +4594,36 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_xchg_pointer_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_pointer_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_pointer_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4689,15 +4689,15 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_xchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4729,10 +4729,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
; VI-LABEL: atomic_xchg_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -4765,11 +4765,11 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
@@ -4807,56 +4807,56 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_xchg_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4886,38 +4886,38 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_xchg_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4982,15 +4982,15 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-LABEL: atomic_xchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5021,10 +5021,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
; VI-LABEL: atomic_xchg_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -5055,11 +5055,11 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1]
@@ -5096,54 +5096,54 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_xchg_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5170,36 +5170,36 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_xor_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xor_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -5265,15 +5265,15 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_xor_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5305,10 +5305,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_xor_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -5341,11 +5341,11 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32
@@ -5383,56 +5383,56 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_xor_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5462,38 +5462,38 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_xor_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xor_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -5558,15 +5558,15 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_xor_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5597,10 +5597,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_xor_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -5631,11 +5631,11 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1]
@@ -5672,54 +5672,54 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_xor_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xor_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5786,11 +5786,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5859,11 +5859,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5897,50 +5897,50 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out,
;
; VI-LABEL: atomic_cmpxchg_i64_ret_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5974,18 +5974,18 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_cmpxchg_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -5994,16 +5994,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
;
; GFX9-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6011,14 +6011,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6058,19 +6058,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; VI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_add_u32 s0, s4, s2
-; VI-NEXT: s_addc_u32 s3, s5, s3
-; VI-NEXT: s_add_u32 s2, s0, 32
-; VI-NEXT: s_addc_u32 s3, s3, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
+; VI-NEXT: s_add_u32 s0, s0, 32
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6084,17 +6084,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GFX9-NEXT: s_add_u32 s2, s4, s2
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_addc_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] offset:32 glc
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
@@ -6104,11 +6104,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
@@ -6184,11 +6184,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6221,50 +6221,50 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: atomic_cmpxchg_i64_ret:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_cmpxchg_i64_ret:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6297,16 +6297,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
;
; VI-LABEL: atomic_cmpxchg_i64_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -6315,16 +6315,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
;
; GFX9-LABEL: atomic_cmpxchg_i64_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6332,14 +6332,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6378,17 +6378,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; VI-LABEL: atomic_cmpxchg_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; VI-NEXT: s_add_u32 s2, s4, s2
-; VI-NEXT: s_addc_u32 s3, s5, s3
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6402,17 +6402,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GFX9-NEXT: s_add_u32 s2, s4, s2
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_addc_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
@@ -6422,11 +6422,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
@@ -6464,42 +6464,42 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6531,42 +6531,42 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr
;
; VI-LABEL: atomic_load_i64_neg_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xffffffe0
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xffffffe0
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64_neg_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:-32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:-32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_neg_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 th:TH_LOAD_NT
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] offset:-32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6596,40 +6596,40 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1
;
; VI-LABEL: atomic_load_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6662,14 +6662,12 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
;
; VI-LABEL: atomic_load_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -6677,9 +6675,11 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64_addr64_offset:
@@ -6700,17 +6700,17 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
; GFX12-LABEL: atomic_load_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6745,22 +6745,22 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i64_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64_addr64:
@@ -6781,17 +6781,17 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr
; GFX12-LABEL: atomic_load_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6825,14 +6825,12 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
;
; VI-LABEL: atomic_load_f64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -6840,9 +6838,11 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f64_addr64_offset:
@@ -6863,17 +6863,17 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
; GFX12-LABEL: atomic_load_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6901,34 +6901,34 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou
;
; VI-LABEL: atomic_store_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s2, 32
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: s_add_u32 s0, s6, 32
+; VI-NEXT: s_addc_u32 s1, s7, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_store_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7] offset:32
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6954,32 +6954,32 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) {
;
; VI-LABEL: atomic_store_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_store_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7008,10 +7008,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace
; VI-LABEL: atomic_store_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s6, s0
; VI-NEXT: s_addc_u32 s1, s7, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -7040,11 +7040,11 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32
@@ -7078,10 +7078,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou
; VI-LABEL: atomic_store_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s6, s0
; VI-NEXT: s_addc_u32 s1, s7, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -7108,11 +7108,11 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -7145,10 +7145,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp
; VI-LABEL: atomic_store_f64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s6, s0
; VI-NEXT: s_addc_u32 s1, s7, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -7177,11 +7177,11 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32
@@ -7211,36 +7211,36 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_inc_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_inc_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -7306,15 +7306,15 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_inc_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7346,10 +7346,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_inc_i64_incr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -7382,11 +7382,11 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
@@ -7416,36 +7416,36 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_dec_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_dec_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -7511,15 +7511,15 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_dec_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7551,10 +7551,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_dec_i64_decr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -7587,11 +7587,11 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index cafd35a..200aa19 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -4905,26 +4905,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
;
; VI-LABEL: atomic_max_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: .LBB88_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -4932,9 +4932,9 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB88_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -5025,76 +5025,76 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_max_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v5, s4
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v5, s8
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB89_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
-; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB89_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[8:9]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB89_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -5146,24 +5146,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
;
; VI-LABEL: atomic_max_i64_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s4, s0, s4
-; VI-NEXT: s_addc_u32 s5, s1, s5
-; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; VI-NEXT: s_add_u32 s2, s4, s0
+; VI-NEXT: s_addc_u32 s3, s5, s1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v6, s3
-; VI-NEXT: v_mov_b32_e32 v7, s2
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: .LBB90_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -5263,25 +5263,25 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_max_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s0
+; VI-NEXT: s_addc_u32 s3, s5, s1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v4, s5
-; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v4, s9
+; VI-NEXT: v_mov_b32_e32 v5, s8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB91_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5293,44 +5293,44 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB91_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB91_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6367,26 +6367,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
;
; VI-LABEL: atomic_umax_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: .LBB102_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -6394,9 +6394,9 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB102_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -6487,76 +6487,76 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v5, s4
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v5, s8
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB103_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
-; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB103_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[8:9]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB103_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6613,25 +6613,25 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_umax_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s0
+; VI-NEXT: s_addc_u32 s3, s5, s1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v4, s5
-; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v4, s9
+; VI-NEXT: v_mov_b32_e32 v5, s8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB104_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -6643,44 +6643,44 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB104_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB104_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -8703,26 +8703,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
;
; VI-LABEL: atomic_min_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: .LBB125_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8730,9 +8730,9 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB125_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -8823,76 +8823,76 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_min_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v5, s4
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v5, s8
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB126_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
-; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB126_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[8:9]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB126_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -8942,20 +8942,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_min_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s3
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: .LBB127_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8963,38 +8963,38 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB127_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB127_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
@@ -9050,25 +9050,25 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_min_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s0
+; VI-NEXT: s_addc_u32 s3, s5, s1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v4, s5
-; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v4, s9
+; VI-NEXT: v_mov_b32_e32 v5, s8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB128_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -9080,44 +9080,44 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB128_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB128_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 6555ceb..9d174be 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -54,95 +54,95 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
@@ -157,14 +157,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v0, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[4:5]
; GFX1164-NEXT: .LBB0_2:
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -179,13 +179,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[4:5]
; GFX1132-NEXT: .LBB0_2:
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -193,95 +193,95 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -296,14 +296,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v0, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[4:5]
; GFX1164-DPP-NEXT: .LBB0_2:
; GFX1164-DPP-NEXT: s_nop 0
; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -318,13 +318,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[4:5]
; GFX1132-DPP-NEXT: .LBB0_2:
; GFX1132-DPP-NEXT: s_nop 0
; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1119,11 +1119,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -1131,12 +1131,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
@@ -1158,64 +1158,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
@@ -1238,27 +1238,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
@@ -1269,8 +1269,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -1280,25 +1280,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
@@ -1323,11 +1323,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -1335,12 +1335,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -1362,64 +1362,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -1442,27 +1442,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -1473,8 +1473,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -1484,25 +1484,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -2349,11 +2349,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -2361,12 +2361,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
@@ -2388,64 +2388,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
@@ -2468,27 +2468,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
@@ -2499,8 +2499,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -2510,25 +2510,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
@@ -2553,11 +2553,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -2565,12 +2565,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -2592,64 +2592,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -2672,27 +2672,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -2703,8 +2703,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -2714,25 +2714,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -4247,11 +4247,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -4259,12 +4259,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB7_2
; GFX9-NEXT: .LBB7_3:
; GFX9-NEXT: s_endpgm
@@ -4286,64 +4286,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB7_2
; GFX1064-NEXT: .LBB7_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB7_2
; GFX1032-NEXT: .LBB7_3:
; GFX1032-NEXT: s_endpgm
@@ -4366,27 +4366,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB7_2
; GFX1164-NEXT: .LBB7_3:
; GFX1164-NEXT: s_endpgm
@@ -4397,8 +4397,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -4408,25 +4408,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB7_2
; GFX1132-NEXT: .LBB7_3:
; GFX1132-NEXT: s_endpgm
@@ -4451,11 +4451,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -4463,12 +4463,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -4490,64 +4490,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1064-DPP-NEXT: .LBB7_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1032-DPP-NEXT: .LBB7_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -4570,27 +4570,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1164-DPP-NEXT: .LBB7_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -4601,8 +4601,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -4612,25 +4612,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1132-DPP-NEXT: .LBB7_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -5452,101 +5452,101 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-NEXT: .LBB9_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-NEXT: .LBB9_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-NEXT: .LBB9_3:
; GFX1032-NEXT: s_endpgm
@@ -5562,165 +5562,165 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-NEXT: .LBB9_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-NEXT: .LBB9_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-DPP-NEXT: .LBB9_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-DPP-NEXT: .LBB9_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -5736,64 +5736,64 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-DPP-NEXT: .LBB9_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-DPP-NEXT: .LBB9_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6349,11 +6349,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -6361,13 +6361,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB11_2
; GFX9-NEXT: .LBB11_3:
; GFX9-NEXT: s_endpgm
@@ -6387,68 +6387,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB11_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB11_2
; GFX1064-NEXT: .LBB11_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB11_2
; GFX1032-NEXT: .LBB11_3:
; GFX1032-NEXT: s_endpgm
@@ -6471,28 +6471,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB11_2
; GFX1164-NEXT: .LBB11_3:
; GFX1164-NEXT: s_endpgm
@@ -6503,8 +6503,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -6514,25 +6514,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB11_2
; GFX1132-NEXT: .LBB11_3:
; GFX1132-NEXT: s_endpgm
@@ -6557,11 +6557,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -6569,13 +6569,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6595,68 +6595,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1064-DPP-NEXT: .LBB11_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1032-DPP-NEXT: .LBB11_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6679,28 +6679,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1164-DPP-NEXT: .LBB11_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6711,8 +6711,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -6722,25 +6722,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1132-DPP-NEXT: .LBB11_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -7296,11 +7296,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -7308,13 +7308,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: .LBB13_3:
; GFX9-NEXT: s_endpgm
@@ -7334,68 +7334,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: .LBB13_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: .LBB13_3:
; GFX1032-NEXT: s_endpgm
@@ -7418,28 +7418,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-NEXT: .LBB13_3:
; GFX1164-NEXT: s_endpgm
@@ -7450,8 +7450,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -7461,25 +7461,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
@@ -7504,11 +7504,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -7516,13 +7516,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-DPP-NEXT: .LBB13_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -7542,68 +7542,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-DPP-NEXT: .LBB13_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-DPP-NEXT: .LBB13_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -7626,28 +7626,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-DPP-NEXT: .LBB13_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -7658,8 +7658,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -7669,25 +7669,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-DPP-NEXT: .LBB13_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -8721,11 +8721,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -8733,13 +8733,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-NEXT: .LBB16_3:
; GFX9-NEXT: s_endpgm
@@ -8759,68 +8759,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-NEXT: .LBB16_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-NEXT: .LBB16_3:
; GFX1032-NEXT: s_endpgm
@@ -8843,28 +8843,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-NEXT: .LBB16_3:
; GFX1164-NEXT: s_endpgm
@@ -8875,8 +8875,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -8886,25 +8886,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-NEXT: .LBB16_3:
; GFX1132-NEXT: s_endpgm
@@ -8929,11 +8929,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -8941,13 +8941,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-DPP-NEXT: .LBB16_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -8967,68 +8967,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-DPP-NEXT: .LBB16_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-DPP-NEXT: .LBB16_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -9051,28 +9051,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-DPP-NEXT: .LBB16_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -9083,8 +9083,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -9094,25 +9094,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-DPP-NEXT: .LBB16_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -9637,330 +9637,330 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB18_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB18_2
; GFX9-NEXT: .LBB18_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB18_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB18_2
; GFX1064-NEXT: .LBB18_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB18_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB18_2
; GFX1032-NEXT: .LBB18_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB18_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB18_2
; GFX1164-NEXT: .LBB18_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB18_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB18_2
; GFX1132-NEXT: .LBB18_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX9-DPP-NEXT: .LBB18_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX1064-DPP-NEXT: .LBB18_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX1032-DPP-NEXT: .LBB18_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX1164-DPP-NEXT: .LBB18_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX1132-DPP-NEXT: .LBB18_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -10007,330 +10007,330 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB19_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB19_2
; GFX9-NEXT: .LBB19_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB19_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB19_2
; GFX1064-NEXT: .LBB19_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB19_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB19_2
; GFX1032-NEXT: .LBB19_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB19_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB19_2
; GFX1164-NEXT: .LBB19_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB19_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB19_2
; GFX1132-NEXT: .LBB19_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX9-DPP-NEXT: .LBB19_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX1064-DPP-NEXT: .LBB19_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX1032-DPP-NEXT: .LBB19_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX1164-DPP-NEXT: .LBB19_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX1132-DPP-NEXT: .LBB19_3:
; GFX1132-DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 6548792..fdb36b3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -57,23 +57,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
@@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
@@ -110,27 +110,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
@@ -144,25 +144,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-NEXT: .LBB0_3:
; GFX1164-NEXT: s_endpgm
@@ -170,30 +170,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-NEXT: .LBB0_3:
; GFX1132-NEXT: s_endpgm
@@ -206,23 +206,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -235,23 +235,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -259,27 +259,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -293,25 +293,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-DPP-NEXT: .LBB0_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -319,30 +319,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-DPP-NEXT: .LBB0_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -1235,23 +1235,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB2_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
@@ -1264,23 +1264,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB2_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
@@ -1288,27 +1288,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
@@ -1322,25 +1322,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB2_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
@@ -1348,30 +1348,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
@@ -1384,23 +1384,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -1413,23 +1413,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -1437,27 +1437,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -1471,25 +1471,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -1497,30 +1497,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -2415,23 +2415,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB4_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
@@ -2444,23 +2444,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB4_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
@@ -2468,27 +2468,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
@@ -2502,25 +2502,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB4_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
@@ -2528,30 +2528,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
@@ -2564,23 +2564,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -2593,23 +2593,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -2617,27 +2617,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -2651,25 +2651,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -2677,30 +2677,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -3597,11 +3597,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -3609,13 +3609,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-NEXT: .LBB6_3:
; GFX9-NEXT: s_endpgm
@@ -3628,25 +3628,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-NEXT: .LBB6_3:
; GFX1064-NEXT: s_endpgm
@@ -3654,29 +3654,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-NEXT: .LBB6_3:
; GFX1032-NEXT: s_endpgm
@@ -3690,27 +3690,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-NEXT: .LBB6_3:
; GFX1164-NEXT: s_endpgm
@@ -3718,30 +3718,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-NEXT: .LBB6_3:
; GFX1132-NEXT: s_endpgm
@@ -3754,11 +3754,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -3766,13 +3766,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-DPP-NEXT: .LBB6_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -3785,25 +3785,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-DPP-NEXT: .LBB6_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -3811,29 +3811,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-DPP-NEXT: .LBB6_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -3847,27 +3847,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-DPP-NEXT: .LBB6_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -3875,30 +3875,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-DPP-NEXT: .LBB6_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -4456,11 +4456,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB8_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -4468,13 +4468,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB8_2
; GFX9-NEXT: .LBB8_3:
; GFX9-NEXT: s_endpgm
@@ -4487,25 +4487,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB8_2
; GFX1064-NEXT: .LBB8_3:
; GFX1064-NEXT: s_endpgm
@@ -4513,29 +4513,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB8_2
; GFX1032-NEXT: .LBB8_3:
; GFX1032-NEXT: s_endpgm
@@ -4549,27 +4549,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB8_2
; GFX1164-NEXT: .LBB8_3:
; GFX1164-NEXT: s_endpgm
@@ -4577,30 +4577,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB8_2
; GFX1132-NEXT: .LBB8_3:
; GFX1132-NEXT: s_endpgm
@@ -4613,11 +4613,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -4625,13 +4625,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX9-DPP-NEXT: .LBB8_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -4644,25 +4644,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1064-DPP-NEXT: .LBB8_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -4670,29 +4670,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1032-DPP-NEXT: .LBB8_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -4706,27 +4706,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1164-DPP-NEXT: .LBB8_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -4734,30 +4734,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1132-DPP-NEXT: .LBB8_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -5315,11 +5315,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB10_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -5327,13 +5327,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-NEXT: .LBB10_3:
; GFX9-NEXT: s_endpgm
@@ -5346,25 +5346,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-NEXT: .LBB10_3:
; GFX1064-NEXT: s_endpgm
@@ -5372,29 +5372,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-NEXT: .LBB10_3:
; GFX1032-NEXT: s_endpgm
@@ -5408,27 +5408,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-NEXT: .LBB10_3:
; GFX1164-NEXT: s_endpgm
@@ -5436,30 +5436,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-NEXT: .LBB10_3:
; GFX1132-NEXT: s_endpgm
@@ -5472,11 +5472,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -5484,13 +5484,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-DPP-NEXT: .LBB10_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -5503,25 +5503,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-DPP-NEXT: .LBB10_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -5529,29 +5529,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-DPP-NEXT: .LBB10_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -5565,27 +5565,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-DPP-NEXT: .LBB10_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -5593,30 +5593,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-DPP-NEXT: .LBB10_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6170,23 +6170,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB12_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB12_2
; GFX9-NEXT: .LBB12_3:
; GFX9-NEXT: s_endpgm
@@ -6199,23 +6199,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB12_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB12_2
; GFX1064-NEXT: .LBB12_3:
; GFX1064-NEXT: s_endpgm
@@ -6223,27 +6223,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB12_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB12_2
; GFX1032-NEXT: .LBB12_3:
; GFX1032-NEXT: s_endpgm
@@ -6257,25 +6257,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB12_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB12_2
; GFX1164-NEXT: .LBB12_3:
; GFX1164-NEXT: s_endpgm
@@ -6283,30 +6283,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB12_2
; GFX1132-NEXT: .LBB12_3:
; GFX1132-NEXT: s_endpgm
@@ -6319,23 +6319,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX9-DPP-NEXT: .LBB12_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6348,23 +6348,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1064-DPP-NEXT: .LBB12_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -6372,27 +6372,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1032-DPP-NEXT: .LBB12_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6406,25 +6406,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1164-DPP-NEXT: .LBB12_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6432,30 +6432,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1132-DPP-NEXT: .LBB12_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6505,23 +6505,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB13_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: .LBB13_3:
; GFX9-NEXT: s_endpgm
@@ -6534,23 +6534,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: .LBB13_3:
; GFX1064-NEXT: s_endpgm
@@ -6558,27 +6558,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: .LBB13_3:
; GFX1032-NEXT: s_endpgm
@@ -6592,25 +6592,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB13_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-NEXT: .LBB13_3:
; GFX1164-NEXT: s_endpgm
@@ -6618,30 +6618,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB13_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
@@ -6654,23 +6654,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-DPP-NEXT: .LBB13_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6683,23 +6683,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-DPP-NEXT: .LBB13_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -6707,27 +6707,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-DPP-NEXT: .LBB13_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6741,25 +6741,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-DPP-NEXT: .LBB13_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6767,30 +6767,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-DPP-NEXT: .LBB13_3:
; GFX1132-DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 6936cdc..d47a424 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -57,23 +57,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
@@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
@@ -110,27 +110,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
@@ -144,25 +144,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-NEXT: .LBB0_3:
; GFX1164-NEXT: s_endpgm
@@ -170,30 +170,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-NEXT: .LBB0_3:
; GFX1132-NEXT: s_endpgm
@@ -206,23 +206,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -235,23 +235,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -259,27 +259,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -293,25 +293,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-DPP-NEXT: .LBB0_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -319,30 +319,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-DPP-NEXT: .LBB0_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -1235,23 +1235,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB2_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
@@ -1264,23 +1264,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB2_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
@@ -1288,27 +1288,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
@@ -1322,25 +1322,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB2_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
@@ -1348,30 +1348,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
@@ -1384,23 +1384,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -1413,23 +1413,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -1437,27 +1437,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -1471,25 +1471,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -1497,30 +1497,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -2415,23 +2415,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB4_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
@@ -2444,23 +2444,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB4_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
@@ -2468,27 +2468,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
@@ -2502,25 +2502,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB4_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
@@ -2528,30 +2528,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
@@ -2564,23 +2564,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -2593,23 +2593,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -2617,27 +2617,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -2651,25 +2651,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -2677,30 +2677,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -3597,11 +3597,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -3609,13 +3609,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-NEXT: .LBB6_3:
; GFX9-NEXT: s_endpgm
@@ -3628,25 +3628,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-NEXT: .LBB6_3:
; GFX1064-NEXT: s_endpgm
@@ -3654,29 +3654,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-NEXT: .LBB6_3:
; GFX1032-NEXT: s_endpgm
@@ -3690,27 +3690,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-NEXT: .LBB6_3:
; GFX1164-NEXT: s_endpgm
@@ -3718,30 +3718,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-NEXT: .LBB6_3:
; GFX1132-NEXT: s_endpgm
@@ -3754,11 +3754,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -3766,13 +3766,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-DPP-NEXT: .LBB6_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -3785,25 +3785,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-DPP-NEXT: .LBB6_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -3811,29 +3811,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-DPP-NEXT: .LBB6_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -3847,27 +3847,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-DPP-NEXT: .LBB6_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -3875,30 +3875,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-DPP-NEXT: .LBB6_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -4456,11 +4456,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB8_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -4468,13 +4468,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB8_2
; GFX9-NEXT: .LBB8_3:
; GFX9-NEXT: s_endpgm
@@ -4487,25 +4487,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB8_2
; GFX1064-NEXT: .LBB8_3:
; GFX1064-NEXT: s_endpgm
@@ -4513,29 +4513,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB8_2
; GFX1032-NEXT: .LBB8_3:
; GFX1032-NEXT: s_endpgm
@@ -4549,27 +4549,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB8_2
; GFX1164-NEXT: .LBB8_3:
; GFX1164-NEXT: s_endpgm
@@ -4577,30 +4577,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB8_2
; GFX1132-NEXT: .LBB8_3:
; GFX1132-NEXT: s_endpgm
@@ -4613,11 +4613,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -4625,13 +4625,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX9-DPP-NEXT: .LBB8_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -4644,25 +4644,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1064-DPP-NEXT: .LBB8_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -4670,29 +4670,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1032-DPP-NEXT: .LBB8_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -4706,27 +4706,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1164-DPP-NEXT: .LBB8_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -4734,30 +4734,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1132-DPP-NEXT: .LBB8_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -5315,11 +5315,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB10_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -5327,13 +5327,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-NEXT: .LBB10_3:
; GFX9-NEXT: s_endpgm
@@ -5346,25 +5346,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-NEXT: .LBB10_3:
; GFX1064-NEXT: s_endpgm
@@ -5372,29 +5372,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-NEXT: .LBB10_3:
; GFX1032-NEXT: s_endpgm
@@ -5408,27 +5408,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-NEXT: .LBB10_3:
; GFX1164-NEXT: s_endpgm
@@ -5436,30 +5436,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-NEXT: .LBB10_3:
; GFX1132-NEXT: s_endpgm
@@ -5472,11 +5472,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -5484,13 +5484,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-DPP-NEXT: .LBB10_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -5503,25 +5503,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-DPP-NEXT: .LBB10_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -5529,29 +5529,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-DPP-NEXT: .LBB10_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -5565,27 +5565,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-DPP-NEXT: .LBB10_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -5593,30 +5593,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-DPP-NEXT: .LBB10_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6170,23 +6170,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB12_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB12_2
; GFX9-NEXT: .LBB12_3:
; GFX9-NEXT: s_endpgm
@@ -6199,23 +6199,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB12_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB12_2
; GFX1064-NEXT: .LBB12_3:
; GFX1064-NEXT: s_endpgm
@@ -6223,27 +6223,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB12_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB12_2
; GFX1032-NEXT: .LBB12_3:
; GFX1032-NEXT: s_endpgm
@@ -6257,25 +6257,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB12_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB12_2
; GFX1164-NEXT: .LBB12_3:
; GFX1164-NEXT: s_endpgm
@@ -6283,30 +6283,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB12_2
; GFX1132-NEXT: .LBB12_3:
; GFX1132-NEXT: s_endpgm
@@ -6319,23 +6319,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX9-DPP-NEXT: .LBB12_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6348,23 +6348,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1064-DPP-NEXT: .LBB12_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -6372,27 +6372,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1032-DPP-NEXT: .LBB12_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6406,25 +6406,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1164-DPP-NEXT: .LBB12_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6432,30 +6432,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1132-DPP-NEXT: .LBB12_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6505,23 +6505,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB13_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: .LBB13_3:
; GFX9-NEXT: s_endpgm
@@ -6534,23 +6534,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: .LBB13_3:
; GFX1064-NEXT: s_endpgm
@@ -6558,27 +6558,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: .LBB13_3:
; GFX1032-NEXT: s_endpgm
@@ -6592,25 +6592,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB13_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-NEXT: .LBB13_3:
; GFX1164-NEXT: s_endpgm
@@ -6618,30 +6618,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB13_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
@@ -6654,23 +6654,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-DPP-NEXT: .LBB13_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6683,23 +6683,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-DPP-NEXT: .LBB13_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -6707,27 +6707,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-DPP-NEXT: .LBB13_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6741,25 +6741,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-DPP-NEXT: .LBB13_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6767,30 +6767,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-DPP-NEXT: .LBB13_3:
; GFX1132-DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 5cb5770..1d251f9 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -54,330 +54,330 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-NEXT: .LBB0_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-NEXT: .LBB0_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-DPP-NEXT: .LBB0_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-DPP-NEXT: .LBB0_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -1223,11 +1223,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -1235,12 +1235,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
@@ -1262,64 +1262,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
@@ -1342,27 +1342,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
@@ -1373,8 +1373,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -1384,25 +1384,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
@@ -1427,11 +1427,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -1439,12 +1439,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -1466,64 +1466,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -1546,27 +1546,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -1577,8 +1577,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -1588,25 +1588,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -2453,11 +2453,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -2465,12 +2465,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
@@ -2492,64 +2492,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
@@ -2572,27 +2572,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
@@ -2603,8 +2603,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -2614,25 +2614,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
@@ -2657,11 +2657,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -2669,12 +2669,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -2696,64 +2696,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -2776,27 +2776,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -2807,8 +2807,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -2818,25 +2818,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -4455,11 +4455,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -4467,12 +4467,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB7_2
; GFX9-NEXT: .LBB7_3:
; GFX9-NEXT: s_endpgm
@@ -4494,64 +4494,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB7_2
; GFX1064-NEXT: .LBB7_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB7_2
; GFX1032-NEXT: .LBB7_3:
; GFX1032-NEXT: s_endpgm
@@ -4574,27 +4574,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB7_2
; GFX1164-NEXT: .LBB7_3:
; GFX1164-NEXT: s_endpgm
@@ -4605,8 +4605,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -4616,25 +4616,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB7_2
; GFX1132-NEXT: .LBB7_3:
; GFX1132-NEXT: s_endpgm
@@ -4659,11 +4659,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -4671,12 +4671,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -4698,64 +4698,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1064-DPP-NEXT: .LBB7_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1032-DPP-NEXT: .LBB7_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -4778,27 +4778,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1164-DPP-NEXT: .LBB7_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -4809,8 +4809,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -4820,25 +4820,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1132-DPP-NEXT: .LBB7_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -5660,101 +5660,101 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-NEXT: .LBB9_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-NEXT: .LBB9_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-NEXT: .LBB9_3:
; GFX1032-NEXT: s_endpgm
@@ -5770,165 +5770,165 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-NEXT: .LBB9_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-NEXT: .LBB9_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-DPP-NEXT: .LBB9_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-DPP-NEXT: .LBB9_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -5944,64 +5944,64 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-DPP-NEXT: .LBB9_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-DPP-NEXT: .LBB9_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6557,11 +6557,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -6569,13 +6569,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB11_2
; GFX9-NEXT: .LBB11_3:
; GFX9-NEXT: s_endpgm
@@ -6595,68 +6595,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB11_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB11_2
; GFX1064-NEXT: .LBB11_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB11_2
; GFX1032-NEXT: .LBB11_3:
; GFX1032-NEXT: s_endpgm
@@ -6679,28 +6679,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB11_2
; GFX1164-NEXT: .LBB11_3:
; GFX1164-NEXT: s_endpgm
@@ -6711,8 +6711,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -6722,25 +6722,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB11_2
; GFX1132-NEXT: .LBB11_3:
; GFX1132-NEXT: s_endpgm
@@ -6765,11 +6765,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -6777,13 +6777,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6803,68 +6803,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1064-DPP-NEXT: .LBB11_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1032-DPP-NEXT: .LBB11_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6887,28 +6887,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1164-DPP-NEXT: .LBB11_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6919,8 +6919,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -6930,25 +6930,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1132-DPP-NEXT: .LBB11_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -7503,11 +7503,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -7515,13 +7515,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: .LBB13_3:
; GFX9-NEXT: s_endpgm
@@ -7541,68 +7541,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: .LBB13_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: .LBB13_3:
; GFX1032-NEXT: s_endpgm
@@ -7625,28 +7625,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-NEXT: .LBB13_3:
; GFX1164-NEXT: s_endpgm
@@ -7657,8 +7657,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -7668,25 +7668,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
@@ -7711,11 +7711,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -7723,13 +7723,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-DPP-NEXT: .LBB13_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -7749,68 +7749,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-DPP-NEXT: .LBB13_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-DPP-NEXT: .LBB13_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -7833,28 +7833,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-DPP-NEXT: .LBB13_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -7865,8 +7865,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -7876,25 +7876,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-DPP-NEXT: .LBB13_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -8927,11 +8927,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -8939,13 +8939,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-NEXT: .LBB16_3:
; GFX9-NEXT: s_endpgm
@@ -8965,68 +8965,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-NEXT: .LBB16_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-NEXT: .LBB16_3:
; GFX1032-NEXT: s_endpgm
@@ -9049,28 +9049,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-NEXT: .LBB16_3:
; GFX1164-NEXT: s_endpgm
@@ -9081,8 +9081,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -9092,25 +9092,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-NEXT: .LBB16_3:
; GFX1132-NEXT: s_endpgm
@@ -9135,11 +9135,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -9147,13 +9147,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-DPP-NEXT: .LBB16_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -9173,68 +9173,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-DPP-NEXT: .LBB16_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-DPP-NEXT: .LBB16_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -9257,28 +9257,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-DPP-NEXT: .LBB16_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -9289,8 +9289,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -9300,25 +9300,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-DPP-NEXT: .LBB16_3:
; GFX1132-DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 297b518..5abd4c9 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -35,8 +35,8 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0
-; CHECK-NEXT: s_movk_i32 s4, 0x130
-; CHECK-NEXT: s_mov_b32 s5, s24
+; CHECK-NEXT: s_movk_i32 s20, 0x130
+; CHECK-NEXT: s_mov_b32 s21, s24
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v4, s36, 0
; CHECK-NEXT: v_writelane_b32 v4, s37, 1
@@ -49,7 +49,7 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v4, s44, 8
; CHECK-NEXT: v_writelane_b32 v4, s45, 9
; CHECK-NEXT: v_writelane_b32 v4, s46, 10
-; CHECK-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0
+; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0
; CHECK-NEXT: v_writelane_b32 v4, s47, 11
; CHECK-NEXT: v_writelane_b32 v4, s48, 12
; CHECK-NEXT: v_writelane_b32 v4, s49, 13
@@ -78,17 +78,17 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v4, s13, 25
; CHECK-NEXT: v_writelane_b32 v4, s14, 26
; CHECK-NEXT: v_writelane_b32 v4, s15, 27
-; CHECK-NEXT: v_writelane_b32 v4, s16, 28
; CHECK-NEXT: v_writelane_b32 v8, s52, 18
-; CHECK-NEXT: v_writelane_b32 v4, s17, 29
+; CHECK-NEXT: v_writelane_b32 v4, s16, 28
; CHECK-NEXT: v_writelane_b32 v8, s53, 19
-; CHECK-NEXT: v_writelane_b32 v4, s18, 30
+; CHECK-NEXT: v_writelane_b32 v4, s17, 29
; CHECK-NEXT: v_writelane_b32 v8, s54, 20
-; CHECK-NEXT: v_writelane_b32 v4, s19, 31
-; CHECK-NEXT: s_mov_b32 s4, 48
-; CHECK-NEXT: s_mov_b32 s5, s24
+; CHECK-NEXT: v_writelane_b32 v4, s18, 30
+; CHECK-NEXT: s_mov_b32 s26, 48
+; CHECK-NEXT: s_mov_b32 s27, s24
; CHECK-NEXT: v_writelane_b32 v8, s55, 21
-; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v4, s19, 31
+; CHECK-NEXT: s_load_dwordx8 s[4:11], s[26:27], 0x0
; CHECK-NEXT: v_writelane_b32 v8, s56, 22
; CHECK-NEXT: v_writelane_b32 v8, s57, 23
; CHECK-NEXT: v_writelane_b32 v8, s58, 24
@@ -107,15 +107,15 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v8, s65, 31
; CHECK-NEXT: v_writelane_b32 v4, s9, 37
; CHECK-NEXT: v_writelane_b32 v8, s66, 32
-; CHECK-NEXT: s_movk_i32 s26, 0x1f0
-; CHECK-NEXT: s_movk_i32 s28, 0x2f0
-; CHECK-NEXT: s_mov_b32 s27, s24
+; CHECK-NEXT: s_movk_i32 s28, 0x1f0
+; CHECK-NEXT: s_movk_i32 s30, 0x2f0
; CHECK-NEXT: s_mov_b32 s29, s24
+; CHECK-NEXT: s_mov_b32 s31, s24
; CHECK-NEXT: v_writelane_b32 v4, s10, 38
; CHECK-NEXT: v_writelane_b32 v8, s67, 33
; CHECK-NEXT: v_writelane_b32 v4, s11, 39
-; CHECK-NEXT: s_load_dwordx16 s[52:67], s[26:27], 0x0
-; CHECK-NEXT: s_load_dwordx16 s[4:19], s[28:29], 0x0
+; CHECK-NEXT: s_load_dwordx16 s[52:67], s[28:29], 0x0
+; CHECK-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 7ee31bf..c6342e5 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -7,9 +7,9 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: udiv32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: s_sub_i32 s4, 0, s6
@@ -36,15 +36,15 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: s_add_i32 s10, s11, 1
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
; GFX9-NEXT: s_cselect_b32 s9, s10, s11
-; GFX9-NEXT: s_add_u32 s10, s0, s2
-; GFX9-NEXT: s_addc_u32 s11, s1, s3
+; GFX9-NEXT: s_add_u32 s10, s2, s0
+; GFX9-NEXT: s_addc_u32 s11, s3, s1
; GFX9-NEXT: s_add_i32 s7, s7, 1
; GFX9-NEXT: s_add_u32 s4, s4, s8
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_add_u32 s2, s2, 4
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_add_u32 s0, s0, 4
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NEXT: s_cbranch_scc0 .LBB0_1
; GFX9-NEXT: ; %bb.2: ; %bb2
@@ -52,20 +52,21 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
;
; GFX10-LABEL: udiv32_invariant_denom:
; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX10-NEXT: s_sub_i32 s2, 0, s6
+; GFX10-NEXT: s_sub_i32 s0, 0, s6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s2, s2, s4
-; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX10-NEXT: s_mov_b64 s[2:3], 0
+; GFX10-NEXT: s_mul_i32 s0, s0, s4
+; GFX10-NEXT: s_mul_hi_u32 s5, s4, s0
+; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: s_add_i32 s8, s4, s5
; GFX10-NEXT: s_mov_b64 s[4:5], 0
; GFX10-NEXT: .LBB0_1: ; %bb3
@@ -83,15 +84,15 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10-NEXT: s_add_i32 s10, s11, 1
; GFX10-NEXT: s_cmp_ge_u32 s9, s6
; GFX10-NEXT: s_cselect_b32 s9, s10, s11
-; GFX10-NEXT: s_add_u32 s10, s0, s2
-; GFX10-NEXT: s_addc_u32 s11, s1, s3
+; GFX10-NEXT: s_add_u32 s10, s2, s0
+; GFX10-NEXT: s_addc_u32 s11, s3, s1
; GFX10-NEXT: s_add_i32 s7, s7, 1
; GFX10-NEXT: s_add_u32 s4, s4, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-NEXT: s_add_u32 s2, s2, 4
-; GFX10-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX10-NEXT: s_add_u32 s0, s0, 4
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
; GFX10-NEXT: s_cbranch_scc0 .LBB0_1
; GFX10-NEXT: ; %bb.2: ; %bb2
@@ -101,11 +102,11 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX11-NEXT: s_sub_i32 s2, 0, s6
+; GFX11-NEXT: s_sub_i32 s0, 0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -114,10 +115,10 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mul_i32 s2, s2, s4
+; GFX11-NEXT: s_mul_i32 s0, s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX11-NEXT: s_mov_b64 s[2:3], 0
+; GFX11-NEXT: s_mul_hi_u32 s5, s4, s0
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_add_i32 s8, s4, s5
; GFX11-NEXT: s_mov_b64 s[4:5], 0
; GFX11-NEXT: .p2align 6
@@ -136,15 +137,15 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_add_i32 s10, s11, 1
; GFX11-NEXT: s_cmp_ge_u32 s9, s6
; GFX11-NEXT: s_cselect_b32 s9, s10, s11
-; GFX11-NEXT: s_add_u32 s10, s0, s2
-; GFX11-NEXT: s_addc_u32 s11, s1, s3
+; GFX11-NEXT: s_add_u32 s10, s2, s0
+; GFX11-NEXT: s_addc_u32 s11, s3, s1
; GFX11-NEXT: s_add_i32 s7, s7, 1
; GFX11-NEXT: s_add_u32 s4, s4, s8
; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: s_addc_u32 s5, s5, 0
-; GFX11-NEXT: s_add_u32 s2, s2, 4
-; GFX11-NEXT: s_addc_u32 s3, s3, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX11-NEXT: s_add_u32 s0, s0, 4
+; GFX11-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
; GFX11-NEXT: s_cbranch_scc0 .LBB0_1
; GFX11-NEXT: ; %bb.2: ; %bb2
@@ -172,9 +173,9 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: urem32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: s_sub_i32 s4, 0, s6
@@ -199,15 +200,15 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: s_sub_i32 s10, s9, s6
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
; GFX9-NEXT: s_cselect_b32 s9, s10, s9
-; GFX9-NEXT: s_add_u32 s10, s0, s2
-; GFX9-NEXT: s_addc_u32 s11, s1, s3
+; GFX9-NEXT: s_add_u32 s10, s2, s0
+; GFX9-NEXT: s_addc_u32 s11, s3, s1
; GFX9-NEXT: s_add_i32 s7, s7, 1
; GFX9-NEXT: s_add_u32 s4, s4, s8
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_add_u32 s2, s2, 4
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_add_u32 s0, s0, 4
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NEXT: s_cbranch_scc0 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %bb2
@@ -215,20 +216,21 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
;
; GFX10-LABEL: urem32_invariant_denom:
; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX10-NEXT: s_sub_i32 s2, 0, s6
+; GFX10-NEXT: s_sub_i32 s0, 0, s6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s2, s2, s4
-; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX10-NEXT: s_mov_b64 s[2:3], 0
+; GFX10-NEXT: s_mul_i32 s0, s0, s4
+; GFX10-NEXT: s_mul_hi_u32 s5, s4, s0
+; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: s_add_i32 s8, s4, s5
; GFX10-NEXT: s_mov_b64 s[4:5], 0
; GFX10-NEXT: .LBB1_1: ; %bb3
@@ -244,15 +246,15 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10-NEXT: s_sub_i32 s10, s9, s6
; GFX10-NEXT: s_cmp_ge_u32 s9, s6
; GFX10-NEXT: s_cselect_b32 s9, s10, s9
-; GFX10-NEXT: s_add_u32 s10, s0, s2
-; GFX10-NEXT: s_addc_u32 s11, s1, s3
+; GFX10-NEXT: s_add_u32 s10, s2, s0
+; GFX10-NEXT: s_addc_u32 s11, s3, s1
; GFX10-NEXT: s_add_i32 s7, s7, 1
; GFX10-NEXT: s_add_u32 s4, s4, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-NEXT: s_add_u32 s2, s2, 4
-; GFX10-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX10-NEXT: s_add_u32 s0, s0, 4
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
; GFX10-NEXT: s_cbranch_scc0 .LBB1_1
; GFX10-NEXT: ; %bb.2: ; %bb2
@@ -262,11 +264,11 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX11-NEXT: s_sub_i32 s2, 0, s6
+; GFX11-NEXT: s_sub_i32 s0, 0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -275,10 +277,10 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mul_i32 s2, s2, s4
+; GFX11-NEXT: s_mul_i32 s0, s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX11-NEXT: s_mov_b64 s[2:3], 0
+; GFX11-NEXT: s_mul_hi_u32 s5, s4, s0
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_add_i32 s8, s4, s5
; GFX11-NEXT: s_mov_b64 s[4:5], 0
; GFX11-NEXT: .p2align 6
@@ -296,15 +298,15 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_sub_i32 s10, s9, s6
; GFX11-NEXT: s_cmp_ge_u32 s9, s6
; GFX11-NEXT: s_cselect_b32 s9, s10, s9
-; GFX11-NEXT: s_add_u32 s10, s0, s2
-; GFX11-NEXT: s_addc_u32 s11, s1, s3
+; GFX11-NEXT: s_add_u32 s10, s2, s0
+; GFX11-NEXT: s_addc_u32 s11, s3, s1
; GFX11-NEXT: s_add_i32 s7, s7, 1
; GFX11-NEXT: s_add_u32 s4, s4, s8
; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: s_addc_u32 s5, s5, 0
-; GFX11-NEXT: s_add_u32 s2, s2, 4
-; GFX11-NEXT: s_addc_u32 s3, s3, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX11-NEXT: s_add_u32 s0, s0, 4
+; GFX11-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
; GFX11-NEXT: s_cbranch_scc0 .LBB1_1
; GFX11-NEXT: ; %bb.2: ; %bb2
@@ -331,14 +333,14 @@ bb3: ; preds = %bb3, %bb
define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GFX9-LABEL: sdiv32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s5, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_abs_i32 s2, s4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT: s_sub_i32 s5, 0, s2
-; GFX9-NEXT: s_ashr_i32 s4, s4, 31
+; GFX9-NEXT: s_abs_i32 s4, s5
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_ashr_i32 s1, s5, 31
+; GFX9-NEXT: s_sub_i32 s5, 0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -349,70 +351,70 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB2_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_mul_hi_u32 s6, s3, s5
-; GFX9-NEXT: s_mul_i32 s7, s6, s2
-; GFX9-NEXT: s_sub_i32 s7, s3, s7
+; GFX9-NEXT: s_mul_hi_u32 s6, s0, s5
+; GFX9-NEXT: s_mul_i32 s7, s6, s4
+; GFX9-NEXT: s_sub_i32 s7, s0, s7
; GFX9-NEXT: s_add_i32 s8, s6, 1
-; GFX9-NEXT: s_sub_i32 s9, s7, s2
-; GFX9-NEXT: s_cmp_ge_u32 s7, s2
+; GFX9-NEXT: s_sub_i32 s9, s7, s4
+; GFX9-NEXT: s_cmp_ge_u32 s7, s4
; GFX9-NEXT: s_cselect_b32 s6, s8, s6
; GFX9-NEXT: s_cselect_b32 s7, s9, s7
; GFX9-NEXT: s_add_i32 s8, s6, 1
-; GFX9-NEXT: s_cmp_ge_u32 s7, s2
+; GFX9-NEXT: s_cmp_ge_u32 s7, s4
; GFX9-NEXT: s_cselect_b32 s6, s8, s6
-; GFX9-NEXT: s_xor_b32 s6, s6, s4
-; GFX9-NEXT: s_sub_i32 s6, s6, s4
-; GFX9-NEXT: s_add_i32 s3, s3, 1
+; GFX9-NEXT: s_xor_b32 s6, s6, s1
+; GFX9-NEXT: s_sub_i32 s6, s6, s1
+; GFX9-NEXT: s_add_i32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, 4
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, 4
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT: s_cbranch_scc0 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdiv32_invariant_denom:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s5, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_abs_i32 s2, s3
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX10-NEXT: s_sub_i32 s4, 0, s2
-; GFX10-NEXT: s_ashr_i32 s3, s3, 31
+; GFX10-NEXT: s_abs_i32 s4, s5
+; GFX10-NEXT: s_ashr_i32 s0, s5, 31
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX10-NEXT: s_sub_i32 s1, 0, s4
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s5, v0
+; GFX10-NEXT: v_readfirstlane_b32 s6, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s4, s4, s5
-; GFX10-NEXT: s_mul_hi_u32 s6, s5, s4
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: s_add_i32 s5, s5, s6
+; GFX10-NEXT: s_mul_i32 s1, s1, s6
+; GFX10-NEXT: s_mul_hi_u32 s5, s6, s1
+; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: s_add_i32 s5, s6, s5
; GFX10-NEXT: .LBB2_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_mul_hi_u32 s6, s4, s5
-; GFX10-NEXT: s_mul_i32 s7, s6, s2
+; GFX10-NEXT: s_mul_hi_u32 s6, s1, s5
+; GFX10-NEXT: s_mul_i32 s7, s6, s4
; GFX10-NEXT: s_add_i32 s8, s6, 1
-; GFX10-NEXT: s_sub_i32 s7, s4, s7
-; GFX10-NEXT: s_sub_i32 s9, s7, s2
-; GFX10-NEXT: s_cmp_ge_u32 s7, s2
+; GFX10-NEXT: s_sub_i32 s7, s1, s7
+; GFX10-NEXT: s_sub_i32 s9, s7, s4
+; GFX10-NEXT: s_cmp_ge_u32 s7, s4
; GFX10-NEXT: s_cselect_b32 s6, s8, s6
; GFX10-NEXT: s_cselect_b32 s7, s9, s7
; GFX10-NEXT: s_add_i32 s8, s6, 1
-; GFX10-NEXT: s_cmp_ge_u32 s7, s2
+; GFX10-NEXT: s_cmp_ge_u32 s7, s4
; GFX10-NEXT: s_cselect_b32 s6, s8, s6
-; GFX10-NEXT: s_add_i32 s4, s4, 1
-; GFX10-NEXT: s_xor_b32 s6, s6, s3
-; GFX10-NEXT: s_sub_i32 s6, s6, s3
+; GFX10-NEXT: s_add_i32 s1, s1, 1
+; GFX10-NEXT: s_xor_b32 s6, s6, s0
+; GFX10-NEXT: s_sub_i32 s6, s6, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_add_u32 s0, s0, 4
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
+; GFX10-NEXT: s_add_u32 s2, s2, 4
+; GFX10-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-NEXT: s_cmpk_eq_i32 s1, 0x400
; GFX10-NEXT: s_cbranch_scc0 .LBB2_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@@ -420,51 +422,51 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-LABEL: sdiv32_invariant_denom:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_abs_i32 s2, s3
-; GFX11-NEXT: s_ashr_i32 s3, s3, 31
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s4, 0, s2
+; GFX11-NEXT: s_abs_i32 s4, s5
+; GFX11-NEXT: s_ashr_i32 s0, s5, 31
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX11-NEXT: s_sub_i32 s1, 0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s5, v0
+; GFX11-NEXT: v_readfirstlane_b32 s6, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mul_i32 s4, s4, s5
+; GFX11-NEXT: s_mul_i32 s1, s1, s6
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_add_i32 s5, s5, s6
+; GFX11-NEXT: s_mul_hi_u32 s5, s6, s1
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_add_i32 s5, s6, s5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB2_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s6, s4, s5
-; GFX11-NEXT: s_mul_i32 s7, s6, s2
+; GFX11-NEXT: s_mul_hi_u32 s6, s1, s5
+; GFX11-NEXT: s_mul_i32 s7, s6, s4
; GFX11-NEXT: s_add_i32 s8, s6, 1
-; GFX11-NEXT: s_sub_i32 s7, s4, s7
+; GFX11-NEXT: s_sub_i32 s7, s1, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s9, s7, s2
-; GFX11-NEXT: s_cmp_ge_u32 s7, s2
+; GFX11-NEXT: s_sub_i32 s9, s7, s4
+; GFX11-NEXT: s_cmp_ge_u32 s7, s4
; GFX11-NEXT: s_cselect_b32 s6, s8, s6
; GFX11-NEXT: s_cselect_b32 s7, s9, s7
; GFX11-NEXT: s_add_i32 s8, s6, 1
-; GFX11-NEXT: s_cmp_ge_u32 s7, s2
+; GFX11-NEXT: s_cmp_ge_u32 s7, s4
; GFX11-NEXT: s_cselect_b32 s6, s8, s6
-; GFX11-NEXT: s_add_i32 s4, s4, 1
-; GFX11-NEXT: s_xor_b32 s6, s6, s3
+; GFX11-NEXT: s_add_i32 s1, s1, 1
+; GFX11-NEXT: s_xor_b32 s6, s6, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s6, s6, s3
+; GFX11-NEXT: s_sub_i32 s6, s6, s0
; GFX11-NEXT: v_mov_b32_e32 v1, s6
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, 4
-; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, 4
+; GFX11-NEXT: s_addc_u32 s3, s3, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s1, 0x400
; GFX11-NEXT: s_cbranch_scc0 .LBB2_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
@@ -491,37 +493,38 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: srem32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_abs_i32 s2, s2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT: s_sub_i32 s4, 0, s2
+; GFX9-NEXT: s_abs_i32 s4, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_sub_i32 s1, 0, s4
+; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v0
-; GFX9-NEXT: s_mul_i32 s4, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s1, s1, s5
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s1
+; GFX9-NEXT: s_add_i32 s1, s5, s1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB3_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4
-; GFX9-NEXT: s_mul_i32 s5, s5, s2
-; GFX9-NEXT: s_sub_i32 s5, s3, s5
-; GFX9-NEXT: s_sub_i32 s6, s5, s2
-; GFX9-NEXT: s_cmp_ge_u32 s5, s2
+; GFX9-NEXT: s_mul_hi_u32 s5, s0, s1
+; GFX9-NEXT: s_mul_i32 s5, s5, s4
+; GFX9-NEXT: s_sub_i32 s5, s0, s5
+; GFX9-NEXT: s_sub_i32 s6, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_sub_i32 s6, s5, s2
-; GFX9-NEXT: s_cmp_ge_u32 s5, s2
+; GFX9-NEXT: s_sub_i32 s6, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_add_i32 s3, s3, 1
+; GFX9-NEXT: s_add_i32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, 4
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, 4
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT: s_cbranch_scc0 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -530,85 +533,85 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_abs_i32 s2, s2
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX10-NEXT: s_sub_i32 s3, 0, s2
+; GFX10-NEXT: s_abs_i32 s4, s2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX10-NEXT: s_sub_i32 s0, 0, s4
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s3, s3, s4
-; GFX10-NEXT: s_mul_hi_u32 s5, s4, s3
-; GFX10-NEXT: s_mov_b32 s3, 0
-; GFX10-NEXT: s_add_i32 s4, s4, s5
+; GFX10-NEXT: s_mul_i32 s0, s0, s1
+; GFX10-NEXT: s_mul_hi_u32 s5, s1, s0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_add_i32 s1, s1, s5
; GFX10-NEXT: .LBB3_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_mul_hi_u32 s5, s3, s4
-; GFX10-NEXT: s_mul_i32 s5, s5, s2
-; GFX10-NEXT: s_sub_i32 s5, s3, s5
-; GFX10-NEXT: s_sub_i32 s6, s5, s2
-; GFX10-NEXT: s_cmp_ge_u32 s5, s2
+; GFX10-NEXT: s_mul_hi_u32 s5, s0, s1
+; GFX10-NEXT: s_mul_i32 s5, s5, s4
+; GFX10-NEXT: s_sub_i32 s5, s0, s5
+; GFX10-NEXT: s_sub_i32 s6, s5, s4
+; GFX10-NEXT: s_cmp_ge_u32 s5, s4
; GFX10-NEXT: s_cselect_b32 s5, s6, s5
-; GFX10-NEXT: s_sub_i32 s6, s5, s2
-; GFX10-NEXT: s_cmp_ge_u32 s5, s2
+; GFX10-NEXT: s_sub_i32 s6, s5, s4
+; GFX10-NEXT: s_cmp_ge_u32 s5, s4
; GFX10-NEXT: s_cselect_b32 s5, s6, s5
-; GFX10-NEXT: s_add_i32 s3, s3, 1
+; GFX10-NEXT: s_add_i32 s0, s0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_add_u32 s0, s0, 4
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400
+; GFX10-NEXT: s_add_u32 s2, s2, 4
+; GFX10-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX10-NEXT: s_cbranch_scc0 .LBB3_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: srem32_invariant_denom:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_abs_i32 s2, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s3, 0, s2
+; GFX11-NEXT: s_abs_i32 s4, s2
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX11-NEXT: s_sub_i32 s0, 0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_i32 s3, s3, s4
-; GFX11-NEXT: s_mul_hi_u32 s5, s4, s3
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_add_i32 s4, s4, s5
+; GFX11-NEXT: s_mul_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mul_hi_u32 s5, s1, s0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_add_i32 s1, s1, s5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB3_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s5, s3, s4
-; GFX11-NEXT: s_mul_i32 s5, s5, s2
+; GFX11-NEXT: s_mul_hi_u32 s5, s0, s1
+; GFX11-NEXT: s_mul_i32 s5, s5, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s5, s3, s5
-; GFX11-NEXT: s_sub_i32 s6, s5, s2
-; GFX11-NEXT: s_cmp_ge_u32 s5, s2
+; GFX11-NEXT: s_sub_i32 s5, s0, s5
+; GFX11-NEXT: s_sub_i32 s6, s5, s4
+; GFX11-NEXT: s_cmp_ge_u32 s5, s4
; GFX11-NEXT: s_cselect_b32 s5, s6, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s6, s5, s2
-; GFX11-NEXT: s_cmp_ge_u32 s5, s2
+; GFX11-NEXT: s_sub_i32 s6, s5, s4
+; GFX11-NEXT: s_cmp_ge_u32 s5, s4
; GFX11-NEXT: s_cselect_b32 s5, s6, s5
-; GFX11-NEXT: s_add_i32 s3, s3, 1
+; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, 4
-; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, 4
+; GFX11-NEXT: s_addc_u32 s3, s3, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX11-NEXT: s_cbranch_scc0 .LBB3_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
@@ -748,12 +751,12 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_movk_i32 s3, 0x400
; GFX9-NEXT: v_mov_b32_e32 v3, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_movk_i32 s0, 0x400
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: .LBB5_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -765,11 +768,12 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0
; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2
+; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4
; GFX9-NEXT: v_sub_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_add_u16_e32 v2, 1, v2
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
-; GFX9-NEXT: global_store_short v5, v4, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v5, v4, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -807,13 +811,13 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-LABEL: urem16_invariant_denom:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_and_b32 s0, s4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB5_1: ; %bb3
@@ -833,10 +837,10 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 1, v3
-; GFX11-NEXT: v_mul_lo_u32 v4, v4, s2
+; GFX11-NEXT: v_mul_lo_u32 v4, v4, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v3, v3, v4
-; GFX11-NEXT: global_store_b16 v5, v3, s[0:1]
+; GFX11-NEXT: global_store_b16 v5, v3, s[2:3]
; GFX11-NEXT: s_cbranch_vccz .LBB5_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
@@ -863,18 +867,18 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: sdiv16_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_movk_i32 s3, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT: s_sext_i32_i16 s4, s2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: s_movk_i32 s0, 0x400
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: .LBB6_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
+; GFX9-NEXT: s_sext_i32_i16 s5, s1
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
-; GFX9-NEXT: s_xor_b32 s6, s5, s2
+; GFX9-NEXT: s_xor_b32 s6, s5, s4
; GFX9-NEXT: s_ashr_i32 s5, s6, 30
; GFX9-NEXT: s_or_b32 s5, s5, 1
; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1
@@ -883,15 +887,16 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT: v_add_u16_e64 v2, s4, 1
+; GFX9-NEXT: v_add_u16_e64 v2, s1, 1
; GFX9-NEXT: s_cselect_b32 s5, s5, 0
-; GFX9-NEXT: s_and_b32 s6, 0xffff, s4
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
+; GFX9-NEXT: s_and_b32 s6, 0xffff, s1
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: v_add_u32_e32 v2, s5, v4
; GFX9-NEXT: s_lshl_b32 s5, s6, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_store_short v3, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v3, v2, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -935,21 +940,21 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-LABEL: sdiv16_invariant_denom:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sext_i32_i16 s2, s2
+; GFX11-NEXT: s_sext_i32_i16 s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB6_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_sext_i32_i16 s4, s3
-; GFX11-NEXT: v_add_nc_u16 v2, s3, 1
+; GFX11-NEXT: s_sext_i32_i16 s4, s1
+; GFX11-NEXT: v_add_nc_u16 v2, s1, 1
; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4
-; GFX11-NEXT: s_xor_b32 s5, s4, s2
+; GFX11-NEXT: s_xor_b32 s5, s4, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: s_ashr_i32 s4, s5, 30
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
@@ -964,12 +969,12 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0|
; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
; GFX11-NEXT: s_cselect_b32 s4, s4, 0
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT: v_readfirstlane_b32 s3, v2
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s1
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v3, s5 :: v_dual_add_nc_u32 v2, s4, v4
-; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
+; GFX11-NEXT: global_store_b16 v3, v2, s[2:3]
; GFX11-NEXT: s_cbranch_vccz .LBB6_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
@@ -996,18 +1001,18 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: srem16_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_movk_i32 s3, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT: s_sext_i32_i16 s4, s2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: s_movk_i32 s0, 0x400
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: .LBB7_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
+; GFX9-NEXT: s_sext_i32_i16 s5, s1
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
-; GFX9-NEXT: s_xor_b32 s6, s5, s2
+; GFX9-NEXT: s_xor_b32 s6, s5, s4
; GFX9-NEXT: s_ashr_i32 s6, s6, 30
; GFX9-NEXT: s_or_b32 s8, s6, 1
; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1
@@ -1016,17 +1021,19 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT: v_add_u16_e64 v2, s4, 1
+; GFX9-NEXT: v_add_u16_e64 v2, s1, 1
; GFX9-NEXT: s_cselect_b32 s6, s8, 0
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
-; GFX9-NEXT: s_and_b32 s7, 0xffff, s4
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX9-NEXT: s_and_b32 s7, 0xffff, s1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: v_add_u32_e32 v2, s6, v4
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4
; GFX9-NEXT: s_lshl_b32 s6, s7, 1
+; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2
-; GFX9-NEXT: global_store_short v3, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v3, v2, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -1073,21 +1080,21 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-LABEL: srem16_invariant_denom:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sext_i32_i16 s2, s2
+; GFX11-NEXT: s_sext_i32_i16 s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB7_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_sext_i32_i16 s4, s3
-; GFX11-NEXT: v_add_nc_u16 v2, s3, 1
+; GFX11-NEXT: s_sext_i32_i16 s4, s1
+; GFX11-NEXT: v_add_nc_u16 v2, s1, 1
; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4
-; GFX11-NEXT: s_xor_b32 s5, s4, s2
+; GFX11-NEXT: s_xor_b32 s5, s4, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: s_ashr_i32 s5, s5, 30
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
@@ -1105,14 +1112,14 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_nc_u32_e32 v3, s5, v3
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT: v_readfirstlane_b32 s3, v2
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s1
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: v_mov_b32_e32 v2, s5
-; GFX11-NEXT: v_mul_lo_u32 v3, v3, s2
+; GFX11-NEXT: v_mul_lo_u32 v3, v3, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v3, s4, v3
-; GFX11-NEXT: global_store_b16 v2, v3, s[0:1]
+; GFX11-NEXT: global_store_b16 v2, v3, s[2:3]
; GFX11-NEXT: s_cbranch_vccz .LBB7_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index 9da07ea..06a5816 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -52,7 +52,7 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -60,10 +60,10 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -102,18 +102,19 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -179,7 +180,7 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_MulMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -190,7 +191,7 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -198,9 +199,9 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -242,21 +243,22 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_MulMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -317,7 +319,7 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -328,7 +330,7 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -336,10 +338,10 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -378,18 +380,19 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -451,7 +454,7 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MixedTypedMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -462,7 +465,7 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -470,10 +473,10 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -515,21 +518,22 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MixedTypedMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -591,7 +595,7 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_alt_AddOperands:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -602,7 +606,7 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -610,10 +614,10 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -656,18 +660,19 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_alt_AddOperands:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -729,7 +734,7 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MixedExt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -740,7 +745,7 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -748,10 +753,10 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -793,21 +798,22 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MixedExt:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -867,7 +873,7 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
; GFX8-LABEL: notudot2_SameVec:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -878,16 +884,16 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -931,22 +937,23 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notudot2_SameVec:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1009,7 +1016,7 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_v4i16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1020,7 +1027,7 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1028,10 +1035,10 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1070,18 +1077,19 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_v4i16:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1143,7 +1151,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_v4i16_Hi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1158,7 +1166,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -1166,10 +1174,10 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1208,18 +1216,19 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_v4i16_Hi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1282,7 +1291,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX8-LABEL: notudot2_v4i16_Even:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1293,7 +1302,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1301,10 +1310,10 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1346,21 +1355,22 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notudot2_v4i16_Even:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1423,7 +1433,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX8-LABEL: notudot2_v4i16_Middle:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1434,7 +1444,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1442,10 +1452,10 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1487,21 +1497,22 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notudot2_v4i16_Middle:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1563,7 +1574,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX8-LABEL: notudot2_DiffIndex:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1574,7 +1585,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -1582,10 +1593,10 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1627,21 +1638,22 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notudot2_DiffIndex:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1704,7 +1716,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_MultipleUses_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1715,7 +1727,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -1723,11 +1735,11 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1773,14 +1785,15 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_MultipleUses_add1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1788,9 +1801,9 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1855,7 +1868,7 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MultipleUses_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1866,7 +1879,7 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -1874,11 +1887,11 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1924,14 +1937,15 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MultipleUses_add1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1939,9 +1953,9 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2006,7 +2020,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_MultipleUses_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2017,7 +2031,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -2025,11 +2039,11 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2079,14 +2093,15 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2094,10 +2109,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2163,7 +2178,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MultipleUses_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2174,7 +2189,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -2182,11 +2197,11 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s2
+; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2236,14 +2251,15 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 16
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2251,10 +2267,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2320,7 +2336,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_MultipleUses_mul2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2331,7 +2347,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -2339,11 +2355,11 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2391,14 +2407,15 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2406,10 +2423,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2475,7 +2492,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MultipleUses_mul2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2486,7 +2503,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -2494,11 +2511,11 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2546,14 +2563,15 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2561,10 +2579,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2629,7 +2647,7 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_acc16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2640,8 +2658,8 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -2770,7 +2788,7 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX8-LABEL: notsdot2_sext8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2781,7 +2799,7 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3
@@ -2791,10 +2809,10 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2840,23 +2858,24 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notsdot2_sext8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_ushort v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0001
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0001
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index fdd9138..c148ba3 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -45,7 +45,7 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -56,7 +56,7 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
@@ -66,14 +66,14 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -115,36 +115,38 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -236,7 +238,7 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -247,8 +249,8 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v7, v3, 0, 8
@@ -344,16 +346,16 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_i16 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_i16 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v2, v0, v3 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -437,7 +439,7 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -448,8 +450,8 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
@@ -529,16 +531,16 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
-; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -615,7 +617,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_multiuse_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -626,7 +628,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
@@ -636,15 +638,15 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v8, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v8, v1, v2, s0
; GFX8-NEXT: v_mad_i32_i24 v4, v4, v5, v8
; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v4
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -695,45 +697,47 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_multiuse_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 8
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_multiuse_mul1:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_bfe_i32 v3, v0, 0, 8
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, v3, s2
+; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, v3, s0
; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -819,7 +823,7 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -830,7 +834,7 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v3
@@ -844,12 +848,12 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v1, v2, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v5, v7, v0
; GFX8-NEXT: v_mad_i32_i24 v2, v4, v6, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -894,36 +898,38 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1001,7 +1007,7 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1012,8 +1018,8 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -1111,15 +1117,16 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_ashrrev_i16 v4, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -1145,20 +1152,21 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc16_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_ashrrev_i16 v4, 8, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
@@ -1190,7 +1198,7 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0
; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1254,7 +1262,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_2ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1265,7 +1273,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v3, v3, 8, 8
@@ -1273,10 +1281,10 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1320,44 +1328,46 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_2ele:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0100
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_2ele:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1425,7 +1435,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1436,7 +1446,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
@@ -1445,12 +1455,12 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1498,44 +1508,46 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_3ele:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3ele:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1610,7 +1622,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3ele_permuted:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1621,7 +1633,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 24, v3
; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8
@@ -1630,12 +1642,12 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 24, v0
; GFX8-NEXT: v_bfe_i32 v5, v0, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1683,44 +1695,46 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_3ele_permuted:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020003
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020003
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3ele_permuted:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020003
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020003
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1795,7 +1809,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_opt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1806,8 +1820,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v7, v3, 16, 8
@@ -1860,9 +1874,10 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_opt:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
@@ -1871,14 +1886,15 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_opt:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -1886,7 +1902,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1970,22 +1986,22 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot4_acc32_3src:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
@@ -2001,20 +2017,20 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_i32_i24 v1, v5, v6, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_3src:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
@@ -2026,20 +2042,20 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v4, s0, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_3src:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0x706010c
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0c00
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT: s_load_dword s1, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0
@@ -2048,20 +2064,19 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v3, v1, s1
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_3src:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0x706010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2071,19 +2086,19 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v3, v0
-; GFX10-DL-NEXT: global_store_dword v2, v1, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v2, v1, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3src:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0x706010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2093,7 +2108,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2177,22 +2192,22 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot4_acc32_3src_3ele:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
@@ -2205,20 +2220,20 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_3src_3ele:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8
@@ -2229,21 +2244,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v4, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_3src_3ele:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c00
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020100
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0
@@ -2253,20 +2268,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_3src_3ele:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2277,19 +2291,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0
-; GFX10-DL-NEXT: global_store_dword v3, v1, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v3, v1, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3src_3ele:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc06010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2300,7 +2314,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2378,9 +2392,9 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_bad_source:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX8-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2390,122 +2404,122 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT: s_sext_i32_i16 s1, s8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v2, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
-; GFX8-NEXT: v_mad_i32_i24 v1, v2, s2, v1
+; GFX8-NEXT: v_mad_i32_i24 v1, v2, s1, v1
; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_bad_source:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT: s_sext_i32_i16 s2, s2
+; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s8
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s2, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s1, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_bad_source:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
+; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0201
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0201
-; GFX9-DL-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT: s_sext_i32_i16 s4, s8
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s4
-; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s2, v3
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4
+; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s4, v3
+; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_bad_source:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX10-DL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX10-DL-NEXT: s_sext_i32_i16 s1, s8
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201
; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s2, s3
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s1, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_bad_source:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x3c
+; GFX11-DL-NEXT: s_load_b32 s8, s[0:1], 0x3c
; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_sext_i32_i16 s2, s2
-; GFX11-DL-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX11-DL-NEXT: s_sext_i32_i16 s1, s8
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s2, s3
+; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s1, s0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2586,7 +2600,7 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_commutative:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2597,7 +2611,7 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
@@ -2606,12 +2620,12 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2659,44 +2673,46 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_commutative:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_commutative:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2776,22 +2792,22 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot4_acc32_3src_3ele_src0:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt vmcnt(1)
@@ -2803,20 +2819,20 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 8, 8
@@ -2827,21 +2843,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v4, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0
@@ -2851,20 +2867,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2875,19 +2890,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0
-; GFX10-DL-NEXT: global_store_dword v3, v1, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v3, v1, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v2, 0xc06010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2898,7 +2913,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2986,7 +3001,7 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -3004,12 +3019,12 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v3, v4, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v4, 8, 8
@@ -3022,8 +3037,8 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v7, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8
; GFX8-NEXT: v_mad_i32_i24 v2, v7, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -3031,14 +3046,14 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX9-NODL-NEXT: global_load_dword v4, v0, s[10:11]
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
@@ -3048,52 +3063,52 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v2
+; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v4
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_4src:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
-; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501
-; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
+; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0501
+; GFX9-DL-NEXT: s_mov_b32 s1, 0x5010c0c
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX9-DL-NEXT: global_load_dword v4, v0, s[10:11]
; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0400
-; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DL-NEXT: s_mov_b32 s5, 0x4000c0c
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s2
+; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s0
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s4
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s3
+; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s1
; GFX9-DL-NEXT: v_perm_b32 v2, v4, v3, s5
; GFX9-DL-NEXT: v_or_b32_e32 v3, v6, v5
; GFX9-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v3, s6
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_4src:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x3
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX10-DL-NEXT: global_load_dword v4, v0, s[10:11]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0501
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400
@@ -3104,23 +3119,23 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_or_b32_e32 v0, v5, v0
; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_4src:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x3
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v3, v0, s[8:9]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[10:11]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_perm_b32 v4, v2, v1, 0xc0c0501
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400
@@ -3133,8 +3148,8 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v2, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v2, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3231,7 +3246,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_nonstandard_signed:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -3243,8 +3258,8 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -3327,10 +3342,11 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_nonstandard_signed:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v6, 0xff
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
@@ -3355,14 +3371,15 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mad_u16 v0, v1, v2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_nonstandard_signed:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
@@ -3390,7 +3407,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 0b131ea..86aab8c 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -45,7 +45,7 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -56,7 +56,7 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
@@ -66,14 +66,14 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -115,34 +115,36 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc32:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -227,7 +229,7 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -239,8 +241,8 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v3
@@ -329,16 +331,16 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u16 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -423,7 +425,7 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -434,8 +436,8 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
@@ -515,16 +517,16 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
-; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -595,7 +597,7 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -606,8 +608,8 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3
@@ -684,14 +686,14 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v2, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
@@ -699,7 +701,7 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3
-; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -765,7 +767,7 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_CommutationInsideMAD:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -776,8 +778,8 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
@@ -857,16 +859,16 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3
-; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -943,7 +945,7 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_CommutationAccrossMADs:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -954,8 +956,8 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
@@ -1035,16 +1037,16 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3
-; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1122,7 +1124,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_multiuse_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1133,7 +1135,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
@@ -1143,15 +1145,15 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s0
; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v8
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1202,45 +1204,47 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_multiuse_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v2, v0
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_multiuse_mul1:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xff, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, v3, s2
+; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, v3, s0
; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1328,7 +1332,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_multiuse_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1339,7 +1343,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
@@ -1349,16 +1353,16 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s2
+; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s0
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v4
; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v5
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1408,46 +1412,48 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_multiuse_add1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2
-; GFX10-DL-NEXT: s_add_i32 s2, s2, s2
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX10-DL-NEXT: s_add_i32 s0, s0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT: v_add3_u32 v0, s2, v0, v1
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_multiuse_add1:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_bfe_u32 v2, v1, 8, 8
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_bfe_u32 v3, v0, 8, 8
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-DL-NEXT: s_add_i32 s2, s2, s2
+; GFX11-DL-NEXT: s_add_i32 s0, s0, s0
; GFX11-DL-NEXT: v_mul_u32_u24_e32 v2, v2, v3
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_add3_u32 v0, s2, v2, v0
-; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: v_add3_u32 v0, s0, v2, v0
+; GFX11-DL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1535,7 +1541,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX8-LABEL: notdot4_mixedtypes:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1547,8 +1553,8 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
@@ -1663,7 +1669,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -1678,7 +1684,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_bfe_i32 v7, v0, 0, 8
; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-DL-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[2:3]
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0302
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0302
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1688,7 +1694,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1778,7 +1784,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX8-LABEL: notdot4_mixedtypes2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1790,8 +1796,8 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
@@ -1920,7 +1926,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -1931,7 +1937,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_and_b32_e32 v9, 0xff, v0
-; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[2:3]
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v4
@@ -1950,7 +1956,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3
; GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2036,7 +2042,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2047,7 +2053,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8
@@ -2059,12 +2065,12 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v5, v7, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v4, v6, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2106,34 +2112,36 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2208,7 +2216,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2220,8 +2228,8 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
@@ -2315,16 +2323,17 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b16 v4, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2348,21 +2357,22 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc16_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v7, 0xff, v1
-; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[2:3]
; GFX11-DL-NEXT: v_lshrrev_b16 v4, 8, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_lshrrev_b16 v5, 8, v0
@@ -2391,7 +2401,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2462,7 +2472,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc8_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2473,8 +2483,8 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -2554,15 +2564,16 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2585,20 +2596,21 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5
; GFX10-DL-NEXT: v_mad_u16 v1, v7, v8, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
-; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc8_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v2, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -2630,7 +2642,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-NEXT: v_mad_u16 v0, v4, v7, v0
; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2691,7 +2703,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_2ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2702,7 +2714,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8
@@ -2710,10 +2722,10 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2757,43 +2769,45 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_2ele:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_2ele:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2860,7 +2874,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2871,7 +2885,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
@@ -2880,12 +2894,12 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2933,43 +2947,45 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_3ele:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3ele:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3043,7 +3059,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3ele_permuted:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -3054,7 +3070,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3
; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3
@@ -3063,12 +3079,12 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -3116,43 +3132,45 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_3ele_permuted:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020003
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020003
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3ele_permuted:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020003
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020003
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3228,7 +3246,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_opt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -3239,8 +3257,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 8
@@ -3293,9 +3311,10 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_opt:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
@@ -3303,14 +3322,15 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_opt:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -3318,7 +3338,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3402,22 +3422,22 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot4_acc32_3src:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
@@ -3433,20 +3453,20 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_u32_u24 v1, v5, v6, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32_3src:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
@@ -3458,20 +3478,20 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v4, s0, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32_3src:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0x706010c
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0c00
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT: s_load_dword s1, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0
@@ -3480,20 +3500,19 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v3, v1, s1
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32_3src:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0x706010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -3502,19 +3521,19 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v0, s0
-; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_3src:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0x706010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -3524,7 +3543,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3609,22 +3628,22 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot4_acc32_3src_3ele:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
@@ -3637,20 +3656,20 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32_3src_3ele:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1
@@ -3661,21 +3680,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v4, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32_3src_3ele:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c00
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020100
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0
@@ -3685,20 +3704,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32_3src_3ele:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -3708,19 +3726,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc020100
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_3src_3ele:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc06010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -3731,7 +3749,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3811,9 +3829,9 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_bad_source:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX8-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -3823,122 +3841,122 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0
-; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT: s_and_b32 s1, s8, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v2, s2, v1
+; GFX8-NEXT: v_mad_u32_u24 v1, v2, s1, v1
; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_bad_source:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX9-NODL-NEXT: s_and_b32 s1, s8, 0xffff
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s2, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s1, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_bad_source:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
+; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0201
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0201
-; GFX9-DL-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT: s_and_b32 s4, s8, 0xffff
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s4
-; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s2, v3
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4
+; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s4, v3
+; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_bad_source:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX10-DL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX10-DL-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201
; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s2, s3
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s1, s0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v2, v0
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_bad_source:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x3c
+; GFX11-DL-NEXT: s_load_b32 s8, s[0:1], 0x3c
; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-DL-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX11-DL-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s2, s3
+; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s1, s0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4019,7 +4037,7 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_commutative:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -4030,7 +4048,7 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
@@ -4039,12 +4057,12 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -4092,43 +4110,45 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_commutative:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_commutative:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4208,22 +4228,22 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot4_acc32_3src_3ele_src0:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt vmcnt(1)
@@ -4235,20 +4255,20 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8
@@ -4259,21 +4279,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v4, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0
@@ -4283,20 +4303,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -4306,19 +4325,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc020101
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v2, 0xc06010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -4329,7 +4348,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4417,7 +4436,7 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -4435,12 +4454,12 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 8
@@ -4453,8 +4472,8 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v7, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -4462,14 +4481,14 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX9-NODL-NEXT: global_load_dword v4, v0, s[10:11]
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
@@ -4479,52 +4498,52 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v2
+; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v4
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_4src:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
-; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501
-; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
+; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0501
+; GFX9-DL-NEXT: s_mov_b32 s1, 0x5010c0c
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX9-DL-NEXT: global_load_dword v4, v0, s[10:11]
; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0400
-; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DL-NEXT: s_mov_b32 s5, 0x4000c0c
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s2
+; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s0
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s4
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s3
+; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s1
; GFX9-DL-NEXT: v_perm_b32 v2, v4, v3, s5
; GFX9-DL-NEXT: v_or_b32_e32 v3, v6, v5
; GFX9-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v3, s6
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_4src:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x3
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX10-DL-NEXT: global_load_dword v4, v0, s[10:11]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0501
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400
@@ -4535,22 +4554,22 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_4src:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x3
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v3, v0, s[8:9]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[10:11]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_perm_b32 v4, v2, v1, 0xc0c0501
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400
@@ -4563,8 +4582,8 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, s2
-; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, s0
+; GFX11-DL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4667,7 +4686,7 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc32_multi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -4678,7 +4697,7 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -4686,7 +4705,7 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v3, v3, v4, s2
+; GFX8-NEXT: v_mad_u32_u24 v3, v3, v4, s0
; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v1
; GFX8-NEXT: v_mad_u32_u24 v3, v7, v8, v3
; GFX8-NEXT: v_bfe_u32 v11, v1, 16, 8
@@ -4702,8 +4721,8 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_mad_u32_u24 v0, v10, v6, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -4762,37 +4781,39 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc32_multi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX10-DL-NEXT: global_load_dword v3, v2, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v2, v1, v0, 0x6040200
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v4, v3, v3, 0x2000200
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v0, 0x7050301
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v4, s2
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v4, s0
; GFX10-DL-NEXT: v_perm_b32 v2, v3, v3, 0x3010301
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v1
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_multi:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b64 v[0:1], v2, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v2, v2, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v3, v1, v0, 0x6040200
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -4801,10 +4822,10 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v2, v2, v2, 0x3010301
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v1, v3, v4, s2
+; GFX11-DL-NEXT: v_dot4_u32_u8 v1, v3, v4, s0
; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v1
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4915,7 +4936,7 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_hilo:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -4928,8 +4949,8 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
@@ -4982,9 +5003,10 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_hilo:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
@@ -4992,14 +5014,15 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_hilo:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -5007,7 +5030,7 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5088,7 +5111,7 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_lohi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -5101,8 +5124,8 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
@@ -5160,9 +5183,10 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_lohi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
@@ -5173,14 +5197,15 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3020001
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_lohi:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -5192,7 +5217,7 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5273,7 +5298,7 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_hihi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -5288,8 +5313,8 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2
; GFX8-NEXT: v_bfe_u32 v7, v2, 8, 8
@@ -5347,9 +5372,10 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_hihi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
@@ -5360,14 +5386,15 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3010002
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_hihi:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -5379,7 +5406,7 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3010002
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5456,7 +5483,7 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_v8i8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -5474,8 +5501,8 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -5513,28 +5540,30 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_v8i8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_v8i8:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5617,7 +5646,7 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_v16i8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -5630,8 +5659,8 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[1:2]
; GFX8-NEXT: flat_load_dword v4, v[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v4
@@ -5696,10 +5725,11 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_v16i8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; kill: killed $vgpr5
; GFX10-DL-NEXT: ; kill: killed $vgpr4
; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7
@@ -5712,15 +5742,16 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_v16i8:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v4, s[6:7]
@@ -5731,7 +5762,7 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5814,10 +5845,10 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_v256i8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: s_movk_i32 s2, 0xfc
+; GFX8-NEXT: s_movk_i32 s0, 0xfc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v1
@@ -5826,11 +5857,11 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
@@ -5890,10 +5921,11 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_v256i8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: global_load_dword v2, v1, s[6:7]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] offset:252
@@ -5903,15 +5935,16 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0x1000302
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_v256i8:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: global_load_b32 v1, v1, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:252
@@ -5921,7 +5954,7 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x1000302
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5997,7 +6030,7 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_anyext:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -6008,17 +6041,17 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -6063,41 +6096,43 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_anyext:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0500
; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_anyext:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0500
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 8c53d26..036965d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -63,11 +63,15 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -77,11 +81,7 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
@@ -93,7 +93,7 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4
@@ -109,8 +109,8 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0
; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -189,44 +189,44 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2
-; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
+; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2
-; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0
+; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -372,11 +372,16 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc16:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -386,14 +391,9 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
@@ -599,20 +599,20 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-LABEL: idot8_acc16:
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -670,27 +670,26 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
-; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc16:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
-; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -748,7 +747,7 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -893,11 +892,16 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc8:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -907,14 +911,9 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
@@ -1120,20 +1119,20 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-LABEL: idot8_acc8:
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -1191,27 +1190,26 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
-; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc8:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
-; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -1269,7 +1267,7 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1401,11 +1399,15 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_multiuses_mul1:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1415,11 +1417,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
@@ -1430,7 +1428,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v16
; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
@@ -1449,8 +1447,8 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v16, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1560,18 +1558,18 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v0, v1, 0, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 4, 4
@@ -1585,7 +1583,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v9, v2, 12, 4
; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s2
+; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s0
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v10, v2, 16, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
@@ -1605,25 +1603,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v5
-; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v2, v1, 0, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v1, 4, 4
@@ -1637,7 +1635,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v9, v0, 12, 4
; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s2
+; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s0
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 16, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
@@ -1657,7 +1655,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5
-; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1788,11 +1786,15 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1802,11 +1804,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_i32 v2, v3, 24, 4
@@ -1826,7 +1824,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v15, v0, 4, 4
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v8, v15, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v7, v14, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v6, v13, v0
@@ -1834,8 +1832,8 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_i32_i24 v0, v4, v11, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v2, v10, v0
; GFX8-NEXT: v_mad_i32_i24 v2, v1, v9, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1914,44 +1912,44 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2
-; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
+; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2
-; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0
+; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2061,11 +2059,16 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2075,14 +2078,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
@@ -2315,19 +2313,19 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
@@ -2401,26 +2399,26 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
@@ -2494,7 +2492,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3
-; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2604,11 +2602,16 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc8_vecMul:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2618,14 +2621,9 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3
@@ -2890,19 +2888,19 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
@@ -2983,26 +2981,26 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v7, v14, v0
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
@@ -3083,7 +3081,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 3828fa5..f29908a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -61,11 +61,15 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -75,11 +79,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
@@ -99,7 +99,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0
@@ -107,8 +107,8 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -187,22 +187,22 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -332,11 +332,15 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -346,13 +350,9 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
@@ -650,11 +650,15 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc8:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -664,13 +668,9 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
@@ -969,11 +969,15 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc4:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -983,13 +987,9 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
@@ -1276,11 +1276,15 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_CommutationInsideMAD:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1290,13 +1294,9 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
@@ -1582,11 +1582,15 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_multiuses_mul1:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1596,11 +1600,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
@@ -1620,7 +1620,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v16
; GFX8-NEXT: v_mad_u32_u24 v3, v8, v15, v16
; GFX8-NEXT: v_mad_u32_u24 v3, v7, v14, v3
@@ -1630,8 +1630,8 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v10, v3
; GFX8-NEXT: v_mad_u32_u24 v1, v1, v9, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1741,18 +1741,18 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1768,7 +1768,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s0
; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 20, 4
; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 16, 4
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v11
@@ -1786,7 +1786,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0
-; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1916,11 +1916,15 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1930,11 +1934,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
@@ -1954,7 +1954,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0
@@ -1962,8 +1962,8 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2042,22 +2042,22 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2152,11 +2152,15 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2166,13 +2170,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
@@ -2324,20 +2324,20 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-LABEL: udot8_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2381,7 +2381,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2475,11 +2475,15 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc8_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2489,13 +2493,9 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3
; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4
@@ -2680,19 +2680,19 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1]
+; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2743,7 +2743,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8
; GFX10-DL-NEXT: v_mad_u16 v0, v10, v16, v0
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_byte v4, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2838,11 +2838,15 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc4_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2852,13 +2856,9 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
@@ -3013,20 +3013,20 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-LABEL: udot8_acc4_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -3071,7 +3071,7 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
-; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -3156,7 +3156,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-LABEL: udot8_variant1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -3167,7 +3167,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 15, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 4, 4
@@ -3187,7 +3187,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v1
; GFX8-NEXT: v_mad_u32_u24 v0, v5, v4, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v7, v6, v0
@@ -3195,8 +3195,8 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-NEXT: v_mad_u32_u24 v0, v11, v10, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v13, v12, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v15, v14, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -3261,18 +3261,19 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
;
; GFX10-DL-LABEL: udot8_variant1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %v2addr,
ptr addrspace(1) %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll
index f7a0e29..66e54aa 100644
--- a/llvm/test/CodeGen/AMDGPU/imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm.ll
@@ -17,13 +17,13 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
;
; VI-LABEL: i64_imm_inline_lo:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 5
; VI-NEXT: v_mov_b32_e32 v1, 0x12345678
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
store i64 1311768464867721221, ptr addrspace(1) %out ; 0x1234567800000005
@@ -45,13 +45,13 @@ define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) {
;
; VI-LABEL: i64_imm_inline_hi:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x12345678
; VI-NEXT: v_mov_b32_e32 v1, 5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
store i64 21780256376, ptr addrspace(1) %out ; 0x0000000512345678
@@ -72,13 +72,13 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_imm_neg_0.0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_bfrev_b32_e32 v1, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store i64 -9223372036854775808, ptr addrspace(1) %out
ret void
@@ -97,12 +97,12 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_neg_0.0_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store i32 -2147483648, ptr addrspace(1) %out
ret void
@@ -121,12 +121,12 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_0.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 0.0, ptr addrspace(1) %out
ret void
@@ -145,12 +145,12 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_imm_neg_0.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -0.0, ptr addrspace(1) %out
ret void
@@ -169,12 +169,12 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0.5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 0.5, ptr addrspace(1) %out
ret void
@@ -193,12 +193,12 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -0.5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -0.5, ptr addrspace(1) %out
ret void
@@ -217,12 +217,12 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_1.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 1.0, ptr addrspace(1) %out
ret void
@@ -241,12 +241,12 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_1.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -1.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -1.0, ptr addrspace(1) %out
ret void
@@ -265,12 +265,12 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_2.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 2.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 2.0, ptr addrspace(1) %out
ret void
@@ -289,12 +289,12 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_2.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -2.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -2.0, ptr addrspace(1) %out
ret void
@@ -313,12 +313,12 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_4.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 4.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 4.0, ptr addrspace(1) %out
ret void
@@ -337,12 +337,12 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_4.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -4.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -4.0, ptr addrspace(1) %out
ret void
@@ -361,12 +361,12 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_inv_2pi_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0.15915494
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 0x3FC45F3060000000, ptr addrspace(1) %out
ret void
@@ -385,12 +385,12 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out)
;
; VI-LABEL: store_inline_imm_m_inv_2pi_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0xbe22f983
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 0xBFC45F3060000000, ptr addrspace(1) %out
ret void
@@ -409,12 +409,12 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_literal_imm_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x45800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 4096.0, ptr addrspace(1) %out
ret void
@@ -434,13 +434,13 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_0.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0.0
store float %y, ptr addrspace(1) %out
@@ -461,13 +461,13 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 0.5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 0.5
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0.5
store float %y, ptr addrspace(1) %out
@@ -488,13 +488,13 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo
;
; VI-LABEL: add_inline_imm_neg_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, -0.5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, -0.5
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, -0.5
store float %y, ptr addrspace(1) %out
@@ -515,13 +515,13 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_1.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 1.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 1.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 1.0
store float %y, ptr addrspace(1) %out
@@ -542,13 +542,13 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo
;
; VI-LABEL: add_inline_imm_neg_1.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, -1.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, -1.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, -1.0
store float %y, ptr addrspace(1) %out
@@ -569,13 +569,13 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_2.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 2.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 2.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 2.0
store float %y, ptr addrspace(1) %out
@@ -596,13 +596,13 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo
;
; VI-LABEL: add_inline_imm_neg_2.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, -2.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, -2.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, -2.0
store float %y, ptr addrspace(1) %out
@@ -623,13 +623,13 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_4.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 4.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 4.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 4.0
store float %y, ptr addrspace(1) %out
@@ -650,13 +650,13 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo
;
; VI-LABEL: add_inline_imm_neg_4.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, -4.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, -4.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, -4.0
store float %y, ptr addrspace(1) %out
@@ -684,20 +684,20 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out,
;
; VI-LABEL: commute_add_inline_imm_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v0, 0.5, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load float, ptr addrspace(1) %in
%y = fadd float %x, 0.5
@@ -726,20 +726,20 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: commute_add_literal_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v0, 0x44800000, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load float, ptr addrspace(1) %in
%y = fadd float %x, 1024.0
@@ -761,13 +761,13 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x)
;
; VI-LABEL: add_inline_imm_1_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 1
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x36a0000000000000
store float %y, ptr addrspace(1) %out
@@ -788,13 +788,13 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x)
;
; VI-LABEL: add_inline_imm_2_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 2
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 2
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x36b0000000000000
store float %y, ptr addrspace(1) %out
@@ -815,13 +815,13 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x
;
; VI-LABEL: add_inline_imm_16_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 16
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 16
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x36e0000000000000
store float %y, ptr addrspace(1) %out
@@ -843,14 +843,14 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float
;
; VI-LABEL: add_inline_imm_neg_1_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s4, s4, -1
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_add_i32 s0, s2, -1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%xbc = bitcast float %x to i32
%y = add i32 %xbc, -1
@@ -874,14 +874,14 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float
;
; VI-LABEL: add_inline_imm_neg_2_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s4, s4, -2
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_add_i32 s0, s2, -2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%xbc = bitcast float %x to i32
%y = add i32 %xbc, -2
@@ -905,14 +905,14 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa
;
; VI-LABEL: add_inline_imm_neg_16_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s4, s4, -16
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_add_i32 s0, s2, -16
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%xbc = bitcast float %x to i32
%y = add i32 %xbc, -16
@@ -935,13 +935,13 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x
;
; VI-LABEL: add_inline_imm_63_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 63
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 63
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x36ff800000000000
store float %y, ptr addrspace(1) %out
@@ -962,13 +962,13 @@ define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x
;
; VI-LABEL: add_inline_imm_64_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 64
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 64
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x3700000000000000
store float %y, ptr addrspace(1) %out
@@ -990,12 +990,12 @@ define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_0.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0.0
store double %y, ptr addrspace(1) %out
@@ -1017,12 +1017,12 @@ define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_0.5_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.5
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0.5
store double %y, ptr addrspace(1) %out
@@ -1044,12 +1044,12 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_neg_0.5_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], -0.5
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, -0.5
store double %y, ptr addrspace(1) %out
@@ -1071,12 +1071,12 @@ define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_1.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 1.0
store double %y, ptr addrspace(1) %out
@@ -1098,12 +1098,12 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_neg_1.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], -1.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, -1.0
store double %y, ptr addrspace(1) %out
@@ -1125,12 +1125,12 @@ define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_2.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 2.0
store double %y, ptr addrspace(1) %out
@@ -1152,12 +1152,12 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_neg_2.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], -2.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, -2.0
store double %y, ptr addrspace(1) %out
@@ -1179,12 +1179,12 @@ define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_4.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 4.0
store double %y, ptr addrspace(1) %out
@@ -1206,12 +1206,12 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_neg_4.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], -4.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, -4.0
store double %y, ptr addrspace(1) %out
@@ -1235,12 +1235,12 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_inv_2pi_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.15915494309189532
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x3fc45f306dc9c882
store double %y, ptr addrspace(1) %out
@@ -1264,14 +1264,14 @@ define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], d
; VI-LABEL: add_m_inv_2pi_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882
; VI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0xbfc45f306dc9c882
store double %y, ptr addrspace(1) %out
@@ -1293,12 +1293,12 @@ define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32]
; VI-LABEL: add_inline_imm_1_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000001
store double %y, ptr addrspace(1) %out
@@ -1320,12 +1320,12 @@ define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32]
; VI-LABEL: add_inline_imm_2_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000002
store double %y, ptr addrspace(1) %out
@@ -1347,12 +1347,12 @@ define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: add_inline_imm_16_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 16
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000010
store double %y, ptr addrspace(1) %out
@@ -1373,13 +1373,13 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x
;
; VI-LABEL: add_inline_imm_neg_1_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, -1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0xffffffffffffffff
store double %y, ptr addrspace(1) %out
@@ -1400,13 +1400,13 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x
;
; VI-LABEL: add_inline_imm_neg_2_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -2
; VI-NEXT: v_mov_b32_e32 v1, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0xfffffffffffffffe
store double %y, ptr addrspace(1) %out
@@ -1427,13 +1427,13 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x
;
; VI-LABEL: add_inline_imm_neg_16_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -16
; VI-NEXT: v_mov_b32_e32 v1, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0xfffffffffffffff0
store double %y, ptr addrspace(1) %out
@@ -1455,12 +1455,12 @@ define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: add_inline_imm_63_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 63
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x000000000000003F
store double %y, ptr addrspace(1) %out
@@ -1482,12 +1482,12 @@ define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: add_inline_imm_64_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 64
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000040
store double %y, ptr addrspace(1) %out
@@ -1508,13 +1508,13 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_0.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 0.0, ptr addrspace(1) %out
ret void
@@ -1534,13 +1534,13 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out)
;
; VI-LABEL: store_literal_imm_neg_0.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_bfrev_b32_e32 v1, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -0.0, ptr addrspace(1) %out
ret void
@@ -1560,13 +1560,13 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_0.5_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x3fe00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 0.5, ptr addrspace(1) %out
ret void
@@ -1586,13 +1586,13 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_0.5_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0xbfe00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -0.5, ptr addrspace(1) %out
ret void
@@ -1612,13 +1612,13 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_1.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 1.0, ptr addrspace(1) %out
ret void
@@ -1638,13 +1638,13 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_1.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0xbff00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -1.0, ptr addrspace(1) %out
ret void
@@ -1664,13 +1664,13 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_2.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 2.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 2.0, ptr addrspace(1) %out
ret void
@@ -1690,13 +1690,13 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_2.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, -2.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -2.0, ptr addrspace(1) %out
ret void
@@ -1716,13 +1716,13 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_4.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x40100000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 4.0, ptr addrspace(1) %out
ret void
@@ -1742,13 +1742,13 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_4.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0xc0100000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -4.0, ptr addrspace(1) %out
ret void
@@ -1768,13 +1768,13 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inv_2pi_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882
; VI-NEXT: v_mov_b32_e32 v1, 0x3fc45f30
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 0x3fc45f306dc9c882, ptr addrspace(1) %out
ret void
@@ -1794,13 +1794,13 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out)
;
; VI-LABEL: store_inline_imm_m_inv_2pi_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882
; VI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 0xbfc45f306dc9c882, ptr addrspace(1) %out
ret void
@@ -1820,13 +1820,13 @@ define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_literal_imm_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x40b00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 4096.0, ptr addrspace(1) %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
index 3cabe41..44e8ae0 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) {
; GCN-NEXT: liveins: $vgpr0, $sgpr2_sgpr3
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0
- ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1
; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
; GCN-NEXT: renamable $sgpr4 = S_MOV_B32 61440
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 18d5c05..2ecc51d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -4,28 +4,28 @@
define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) {
; GCN-LABEL: float4_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s2, 3
+; GCN-NEXT: s_cmp_lg_u32 s8, 3
; GCN-NEXT: v_mov_b32_e32 v0, s7
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 2
+; GCN-NEXT: s_cmp_lg_u32 s8, 2
; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v5, s3
; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -37,14 +37,14 @@ entry:
define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) {
; GCN-LABEL: float4_inselt_undef:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -56,24 +56,24 @@ entry:
define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) {
; GCN-LABEL: int4_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s2, 3
-; GCN-NEXT: s_cselect_b32 s3, s7, 1
-; GCN-NEXT: s_cmp_lg_u32 s2, 2
-; GCN-NEXT: s_cselect_b32 s6, s6, 1
-; GCN-NEXT: s_cmp_lg_u32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 3
+; GCN-NEXT: s_cselect_b32 s0, s7, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 2
+; GCN-NEXT: s_cselect_b32 s1, s6, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: s_cselect_b32 s5, s5, 1
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s2, s4, 1
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s4, s4, 1
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -110,27 +110,27 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec
; GCN-LABEL: float8_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x64
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s12, s[0:1], 0x64
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: s_add_u32 s0, s2, 16
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: v_mov_b32_e32 v6, s10
; GCN-NEXT: v_mov_b32_e32 v7, s11
-; GCN-NEXT: v_mov_b32_e32 v9, s3
+; GCN-NEXT: s_mov_b32 m0, s12
+; GCN-NEXT: v_mov_b32_e32 v9, s1
; GCN-NEXT: v_movreld_b32_e32 v0, 1.0
-; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -336,56 +336,56 @@ define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec,
; GCN-LABEL: half8_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s3, s7, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 7
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_lshr_b32 s0, s7, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 7
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 6
+; GCN-NEXT: s_cmp_lg_u32 s8, 6
; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s7
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s6, 16
+; GCN-NEXT: s_lshr_b32 s0, s6, 16
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 5
+; GCN-NEXT: s_cmp_lg_u32 s8, 5
; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 4
+; GCN-NEXT: s_cmp_lg_u32 s8, 4
; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s5, 16
+; GCN-NEXT: s_lshr_b32 s0, s5, 16
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 3
+; GCN-NEXT: s_cmp_lg_u32 s8, 3
; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 2
+; GCN-NEXT: s_cmp_lg_u32 s8, 2
; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s4, 16
+; GCN-NEXT: s_lshr_b32 s0, s4, 16
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s4
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GCN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -468,98 +468,98 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec,
; GCN-LABEL: byte16_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s3, s7, 24
-; GCN-NEXT: s_cmp_lg_u32 s2, 15
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_lshr_b32 s0, s7, 24
+; GCN-NEXT: s_cmp_lg_u32 s8, 15
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s7, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 14
+; GCN-NEXT: s_lshr_b32 s0, s7, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 14
; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s7, 8
+; GCN-NEXT: s_lshr_b32 s0, s7, 8
; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 13
+; GCN-NEXT: s_cmp_lg_u32 s8, 13
; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 12
+; GCN-NEXT: s_cmp_lg_u32 s8, 12
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s7
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc
-; GCN-NEXT: s_lshr_b32 s3, s6, 24
+; GCN-NEXT: s_lshr_b32 s0, s6, 24
; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: s_cmp_lg_u32 s2, 11
+; GCN-NEXT: s_cmp_lg_u32 s8, 11
; GCN-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s6, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 10
+; GCN-NEXT: s_lshr_b32 s0, s6, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 10
; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s6, 8
+; GCN-NEXT: s_lshr_b32 s0, s6, 8
; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 9
+; GCN-NEXT: s_cmp_lg_u32 s8, 9
; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 8
+; GCN-NEXT: s_cmp_lg_u32 s8, 8
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc
-; GCN-NEXT: s_lshr_b32 s3, s5, 24
+; GCN-NEXT: s_lshr_b32 s0, s5, 24
; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: s_cmp_lg_u32 s2, 7
+; GCN-NEXT: s_cmp_lg_u32 s8, 7
; GCN-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s5, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 6
+; GCN-NEXT: s_lshr_b32 s0, s5, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 6
; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s5, 8
+; GCN-NEXT: s_lshr_b32 s0, s5, 8
; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 5
+; GCN-NEXT: s_cmp_lg_u32 s8, 5
; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 4
+; GCN-NEXT: s_cmp_lg_u32 s8, 4
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc
-; GCN-NEXT: s_lshr_b32 s3, s4, 24
+; GCN-NEXT: s_lshr_b32 s0, s4, 24
; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: s_cmp_lg_u32 s2, 3
+; GCN-NEXT: s_cmp_lg_u32 s8, 3
; GCN-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s4, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 2
+; GCN-NEXT: s_lshr_b32 s0, s4, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 2
; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s4, 8
+; GCN-NEXT: s_lshr_b32 s0, s4, 8
; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s4
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
@@ -567,8 +567,8 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec,
; GCN-NEXT: v_cndmask_b32_e32 v5, 1, v5, vcc
; GCN-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -580,22 +580,22 @@ entry:
define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) {
; GCN-LABEL: double2_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_eq_u32 s2, 1
-; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s7
-; GCN-NEXT: s_cselect_b32 s6, 0, s6
-; GCN-NEXT: s_cmp_eq_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s5
+; GCN-NEXT: s_cmp_eq_u32 s8, 1
+; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s7
+; GCN-NEXT: s_cselect_b32 s1, 0, s6
+; GCN-NEXT: s_cmp_eq_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5
; GCN-NEXT: s_cselect_b32 s4, 0, s4
-; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v5, s3
; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s2
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -607,48 +607,48 @@ entry:
define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) {
; GCN-LABEL: double5_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4
-; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84
-; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24
-; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
+; GCN-NEXT: s_load_dword s14, s[0:1], 0xa4
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x84
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_eq_u32 s12, 4
-; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9
-; GCN-NEXT: s_cselect_b32 s8, 0, s8
-; GCN-NEXT: s_cmp_eq_u32 s12, 1
+; GCN-NEXT: s_cmp_eq_u32 s14, 4
; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s3
; GCN-NEXT: s_cselect_b32 s2, 0, s2
-; GCN-NEXT: s_cmp_eq_u32 s12, 0
-; GCN-NEXT: s_cselect_b32 s13, 0x3ff00000, s1
-; GCN-NEXT: s_cselect_b32 s14, 0, s0
-; GCN-NEXT: s_cmp_eq_u32 s12, 3
-; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s7
-; GCN-NEXT: s_cselect_b32 s1, 0, s6
-; GCN-NEXT: s_cmp_eq_u32 s12, 2
+; GCN-NEXT: s_cmp_eq_u32 s14, 1
+; GCN-NEXT: s_cselect_b32 s7, 0x3ff00000, s7
+; GCN-NEXT: s_cselect_b32 s6, 0, s6
+; GCN-NEXT: s_cmp_eq_u32 s14, 0
; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5
; GCN-NEXT: s_cselect_b32 s4, 0, s4
+; GCN-NEXT: s_cmp_eq_u32 s14, 3
+; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s11
+; GCN-NEXT: s_cselect_b32 s1, 0, s10
+; GCN-NEXT: s_cmp_eq_u32 s14, 2
+; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9
+; GCN-NEXT: s_cselect_b32 s8, 0, s8
; GCN-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NEXT: s_add_u32 s0, s10, 16
+; GCN-NEXT: s_add_u32 s0, s12, 16
; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: s_addc_u32 s1, s11, 0
+; GCN-NEXT: s_addc_u32 s1, s13, 0
; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NEXT: s_add_u32 s0, s10, 32
-; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NEXT: s_addc_u32 s1, s11, 0
+; GCN-NEXT: v_mov_b32_e32 v4, s12
+; GCN-NEXT: s_add_u32 s0, s12, 32
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NEXT: v_mov_b32_e32 v5, s13
+; GCN-NEXT: s_addc_u32 s1, s13, 0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
@@ -661,12 +661,12 @@ entry:
define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) {
; GCN-LABEL: double8_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0xa4
+; GCN-NEXT: s_load_dword s20, s[0:1], 0xa4
; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 1
+; GCN-NEXT: s_lshl_b32 s0, s20, 1
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
@@ -683,29 +683,29 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v
; GCN-NEXT: v_mov_b32_e32 v13, s17
; GCN-NEXT: v_mov_b32_e32 v14, s18
; GCN-NEXT: v_mov_b32_e32 v15, s19
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: s_add_u32 s2, s0, 48
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_add_u32 s0, s2, 48
; GCN-NEXT: v_movreld_b32_e32 v0, 0
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_movreld_b32_e32 v1, v16
-; GCN-NEXT: v_mov_b32_e32 v17, s3
-; GCN-NEXT: v_mov_b32_e32 v16, s2
-; GCN-NEXT: s_add_u32 s2, s0, 32
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_add_u32 s0, s2, 32
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NEXT: s_add_u32 s0, s2, 16
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -773,11 +773,12 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double>
; GCN-NEXT: s_load_dword s2, s[0:1], 0x124
; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: s_lshl_b32 s2, s2, 1
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
@@ -809,53 +810,53 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double>
; GCN-NEXT: v_mov_b32_e32 v29, s17
; GCN-NEXT: v_mov_b32_e32 v30, s18
; GCN-NEXT: v_mov_b32_e32 v31, s19
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: s_add_u32 s2, s0, 0x70
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_u32 s0, s2, 0x70
; GCN-NEXT: v_movreld_b32_e32 v0, 0
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_movreld_b32_e32 v1, v32
-; GCN-NEXT: v_mov_b32_e32 v33, s3
-; GCN-NEXT: v_mov_b32_e32 v32, s2
-; GCN-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v33, s1
+; GCN-NEXT: v_mov_b32_e32 v32, s0
+; GCN-NEXT: s_add_u32 s0, s2, 0x60
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v29, s3
-; GCN-NEXT: v_mov_b32_e32 v28, s2
-; GCN-NEXT: s_add_u32 s2, s0, 0x50
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v29, s1
+; GCN-NEXT: v_mov_b32_e32 v28, s0
+; GCN-NEXT: s_add_u32 s0, s2, 0x50
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v25, s3
-; GCN-NEXT: v_mov_b32_e32 v24, s2
-; GCN-NEXT: s_add_u32 s2, s0, 64
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v25, s1
+; GCN-NEXT: v_mov_b32_e32 v24, s0
+; GCN-NEXT: s_add_u32 s0, s2, 64
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v21, s3
-; GCN-NEXT: v_mov_b32_e32 v20, s2
-; GCN-NEXT: s_add_u32 s2, s0, 48
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v21, s1
+; GCN-NEXT: v_mov_b32_e32 v20, s0
+; GCN-NEXT: s_add_u32 s0, s2, 48
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v17, s3
-; GCN-NEXT: v_mov_b32_e32 v16, s2
-; GCN-NEXT: s_add_u32 s2, s0, 32
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_add_u32 s0, s2, 32
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NEXT: s_add_u32 s0, s2, 16
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -875,12 +876,14 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double>
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_load_dword s4, s[0:1], 0x124
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v28, s2
+; GCN-NEXT: v_mov_b32_e32 v29, s3
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s2, s4, 1
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
@@ -906,49 +909,48 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double>
; GCN-NEXT: v_mov_b32_e32 v25, s21
; GCN-NEXT: v_mov_b32_e32 v26, s22
; GCN-NEXT: v_mov_b32_e32 v27, s23
-; GCN-NEXT: v_mov_b32_e32 v29, s3
-; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: v_movreld_b32_e32 v0, 0
-; GCN-NEXT: s_add_u32 s2, s0, 0x50
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_u32 s0, s2, 0x50
; GCN-NEXT: v_movreld_b32_e32 v1, v32
-; GCN-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v31, s3
-; GCN-NEXT: v_mov_b32_e32 v30, s2
-; GCN-NEXT: s_add_u32 s2, s0, 64
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NEXT: v_mov_b32_e32 v31, s1
+; GCN-NEXT: v_mov_b32_e32 v30, s0
+; GCN-NEXT: s_add_u32 s0, s2, 64
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[30:31], v[20:23]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v21, s3
-; GCN-NEXT: v_mov_b32_e32 v20, s2
-; GCN-NEXT: s_add_u32 s2, s0, 48
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v21, s1
+; GCN-NEXT: v_mov_b32_e32 v20, s0
+; GCN-NEXT: s_add_u32 s0, s2, 48
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v17, s3
-; GCN-NEXT: v_mov_b32_e32 v16, s2
-; GCN-NEXT: s_add_u32 s2, s0, 32
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_add_u32 s0, s2, 32
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NEXT: s_add_u32 s0, s2, 16
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-NEXT: s_add_u32 s2, s0, 0x70
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: s_add_u32 s0, s2, 0x70
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_add_u32 s0, s0, 0x60
+; GCN-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_add_u32 s0, s2, 0x60
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[28:29]
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 68427e8..eb7c587 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1741,20 +1741,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
; VI-LABEL: s_dynamic_insertelement_v8i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: s_load_dword s8, s[4:5], 0x10
+; VI-NEXT: s_load_dword s10, s[4:5], 0x10
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_lshl_b32 s0, s8, 3
+; VI-NEXT: s_lshl_b32 s0, s10, 3
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0
-; VI-NEXT: s_and_b32 s9, s1, 0x5050505
+; VI-NEXT: s_and_b32 s3, s1, 0x5050505
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
-; VI-NEXT: s_and_b32 s8, s0, 0x5050505
-; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3]
+; VI-NEXT: s_andn2_b64 s[8:9], s[8:9], s[0:1]
+; VI-NEXT: s_and_b32 s2, s0, 0x5050505
+; VI-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 1313460..e351b6d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -907,12 +907,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB7_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
@@ -936,12 +936,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB7_2:
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_readfirstlane_b32 s2, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s0, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX90A-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX90A-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_endpgm
;
@@ -966,12 +966,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX10-NEXT: .LBB7_2:
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
@@ -995,12 +995,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-NEXT: .LBB7_2:
; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: s_endpgm
;
@@ -1025,13 +1025,13 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: .LBB7_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
;
@@ -1056,13 +1056,13 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: .LBB7_2:
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12-NEXT: v_readfirstlane_b32 s0, v1
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX12-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm
%val = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst
@@ -1095,12 +1095,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB8_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
@@ -1123,12 +1123,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: .LBB8_2:
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_readfirstlane_b32 s2, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s0, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX90A-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX90A-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_endpgm
;
@@ -1151,12 +1151,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10-NEXT: .LBB8_2:
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
@@ -1179,12 +1179,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: .LBB8_2:
; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: s_endpgm
;
@@ -1208,13 +1208,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB8_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
;
@@ -1238,13 +1238,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: .LBB8_2:
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12-NEXT: v_readfirstlane_b32 s0, v1
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX12-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a344128..aab7b57 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -20,13 +20,13 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw
;
; VI-LABEL: i8_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -92,13 +92,13 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe
;
; VI-LABEL: i8_zext_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -167,13 +167,13 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe
;
; VI-LABEL: i8_sext_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sext_i32_i8 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sext_i32_i8 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -242,13 +242,13 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou
;
; VI-LABEL: i16_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -314,13 +314,13 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer
;
; VI-LABEL: i16_zext_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -389,13 +389,13 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig
;
; VI-LABEL: i16_sext_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sext_i32_i16 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -858,18 +858,18 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i
;
; VI-LABEL: v3i8_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_add_u32 s0, s0, 2
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_lshr_b32 s1, s4, 16
+; VI-NEXT: s_add_u32 s0, s2, 2
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: flat_store_byte v[2:3], v5
; VI-NEXT: flat_store_short v[0:1], v4
; VI-NEXT: s_endpgm
@@ -1118,13 +1118,13 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32>
; VI-LABEL: v3i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
;
@@ -1197,13 +1197,13 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float
; VI-LABEL: v3f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
;
@@ -1396,15 +1396,15 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32>
;
; VI-LABEL: v4i32_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -1470,15 +1470,15 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float
;
; VI-LABEL: v4f32_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -1688,19 +1688,19 @@ define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16>
; VI-LABEL: v5i16_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dword s5, s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dword s6, s[0:1], 0x3c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s2, 8
-; VI-NEXT: v_mov_b32_e32 v4, s5
-; VI-NEXT: s_addc_u32 s5, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: s_add_u32 s0, s2, 8
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: flat_store_short v[2:3], v4
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -1920,22 +1920,22 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32>
;
; VI-LABEL: v5i32_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dword s7, s[0:1], 0x54
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s8, s[0:1], 0x54
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s6, s4, 16
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_addc_u32 s7, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2018,22 +2018,22 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float
;
; VI-LABEL: v5f32_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dword s7, s[0:1], 0x54
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s8, s[0:1], 0x54
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s6, s4, 16
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s7, s5, 0
-; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s8
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: flat_store_dword v[1:2], v3
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2124,32 +2124,32 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64>
;
; VI-LABEL: v5i64_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x84
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s12, s8, 32
-; VI-NEXT: v_mov_b32_e32 v1, s10
-; VI-NEXT: s_addc_u32 s13, s9, 0
-; VI-NEXT: v_mov_b32_e32 v3, s12
-; VI-NEXT: v_mov_b32_e32 v2, s11
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s13
-; VI-NEXT: s_add_u32 s4, s8, 16
+; VI-NEXT: s_add_u32 s0, s2, 32
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v1, s12
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: v_mov_b32_e32 v2, s13
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_addc_u32 s5, s9, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2266,32 +2266,32 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
;
; VI-LABEL: v5f64_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x84
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s12, s8, 32
-; VI-NEXT: v_mov_b32_e32 v1, s10
-; VI-NEXT: s_addc_u32 s13, s9, 0
-; VI-NEXT: v_mov_b32_e32 v3, s12
-; VI-NEXT: v_mov_b32_e32 v2, s11
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s13
-; VI-NEXT: s_add_u32 s4, s8, 16
+; VI-NEXT: s_add_u32 s0, s2, 32
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v1, s12
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: v_mov_b32_e32 v2, s13
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_addc_u32 s5, s9, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2649,15 +2649,15 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
;
; VI-LABEL: v8i16_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2904,23 +2904,23 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; VI-LABEL: v8i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -3015,23 +3015,23 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float
; VI-LABEL: v8f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -3120,15 +3120,15 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
;
; VI-LABEL: v16i8_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -3577,23 +3577,23 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
; VI-LABEL: v16i16_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -4045,41 +4045,41 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32
; VI-LABEL: v16i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_add_u32 s2, s0, 48
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s0, 32
+; VI-NEXT: s_add_u32 s0, s2, 48
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s2, 32
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s0, 16
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -4233,41 +4233,41 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo
; VI-LABEL: v16f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_add_u32 s2, s0, 48
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s0, 32
+; VI-NEXT: s_add_u32 s0, s2, 48
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s2, 32
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s0, 16
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -4401,12 +4401,12 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin
;
; VI-LABEL: kernel_arg_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -4463,12 +4463,12 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) {
;
; VI-LABEL: f64_kernel_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -4652,13 +4652,13 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
;
; VI-LABEL: i1_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -4743,13 +4743,13 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin
;
; VI-LABEL: i1_arg_zext_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -4816,14 +4816,14 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin
;
; VI-LABEL: i1_arg_zext_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 1
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_and_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -4891,13 +4891,13 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin
;
; VI-LABEL: i1_arg_sext_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i32 s2, s2, 0x10000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_bfe_i32 s0, s4, 0x10000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -4967,13 +4967,13 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin
; VI-LABEL: i1_arg_sext_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x10000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -5089,25 +5089,25 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
;
; VI-LABEL: struct_argument_alignment:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x24
+; VI-NEXT: s_load_dword s6, s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; VI-NEXT: s_load_dword s5, s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; VI-NEXT: s_load_dword s7, s[0:1], 0x3c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x44
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
@@ -5254,14 +5254,14 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x28
+; VI-NEXT: s_load_dword s4, s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v7, s4
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dword v[2:3], v7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
@@ -5413,32 +5413,32 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8,
;
; VI-LABEL: struct_argument_alignment_after:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s8, s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; VI-NEXT: s_load_dword s9, s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
+; VI-NEXT: s_load_dword s10, s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; VI-NEXT: s_load_dword s11, s[0:1], 0x3c
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x54
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s9
+; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
@@ -5902,12 +5902,12 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu
; VI-LABEL: byref_align_constant_i32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x124
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x124
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
index 142a6ed..1f14da1 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
@@ -80,25 +80,25 @@ define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_addc_u32 s89, s89, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 3
; VI-NEXT: v_mov_b32_e32 v0, 0x42280000
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s0, s4, 3
+; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32
-; VI-NEXT: s_lshl_b32 s2, s2, 4
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: ds_min_f32 v2, v0 offset:64
-; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: ds_min_rtn_f32 v0, v0, v1
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; VI-NEXT: s_endpgm
@@ -159,20 +159,20 @@ define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %
; GFX11-LABEL: lds_ds_fmin:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s3, s2, 3
+; GFX11-NEXT: s_lshl_b32 s0, s4, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32
; GFX11-NEXT: ds_min_f32 v2, v0 offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NEXT: ds_min_rtn_f32 v0, v3, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-NEXT: scratch_store_b32 off, v0, s2
; GFX11-NEXT: s_endpgm
;
; G_SI-LABEL: lds_ds_fmin:
@@ -235,26 +235,26 @@ define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %
; G_VI: ; %bb.0:
; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; G_VI-NEXT: s_mov_b32 s90, -1
; G_VI-NEXT: s_mov_b32 s91, 0xe80000
; G_VI-NEXT: s_add_u32 s88, s88, s3
+; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; G_VI-NEXT: s_addc_u32 s89, s89, 0
-; G_VI-NEXT: s_waitcnt lgkmcnt(0)
-; G_VI-NEXT: s_add_i32 s2, s2, 4
-; G_VI-NEXT: s_lshl_b32 s3, s2, 3
; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000
-; G_VI-NEXT: v_mov_b32_e32 v1, s3
; G_VI-NEXT: s_mov_b32 m0, -1
+; G_VI-NEXT: s_waitcnt lgkmcnt(0)
+; G_VI-NEXT: s_add_i32 s4, s4, 4
+; G_VI-NEXT: s_lshl_b32 s0, s4, 3
+; G_VI-NEXT: v_mov_b32_e32 v1, s0
; G_VI-NEXT: ds_min_rtn_f32 v1, v1, v0
-; G_VI-NEXT: s_lshl_b32 s2, s2, 4
-; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: s_lshl_b32 s0, s4, 4
+; G_VI-NEXT: v_mov_b32_e32 v2, s0
; G_VI-NEXT: ds_min_f32 v2, v0
-; G_VI-NEXT: v_mov_b32_e32 v0, s1
+; G_VI-NEXT: v_mov_b32_e32 v0, s3
; G_VI-NEXT: s_waitcnt lgkmcnt(1)
; G_VI-NEXT: ds_min_rtn_f32 v0, v0, v1
-; G_VI-NEXT: v_mov_b32_e32 v1, s0
+; G_VI-NEXT: v_mov_b32_e32 v1, s2
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; G_VI-NEXT: s_endpgm
@@ -289,49 +289,48 @@ define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %
; G_GFX10-LABEL: lds_ds_fmin:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
-; G_GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; G_GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; G_GFX10-NEXT: s_mov_b32 s6, -1
-; G_GFX10-NEXT: s_mov_b32 s7, 0x31c16000
-; G_GFX10-NEXT: s_add_u32 s4, s4, s3
-; G_GFX10-NEXT: s_addc_u32 s5, s5, 0
-; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; G_GFX10-NEXT: s_mov_b32 s10, -1
+; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000
+; G_GFX10-NEXT: s_add_u32 s8, s8, s3
+; G_GFX10-NEXT: s_addc_u32 s9, s9, 0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: s_add_i32 s2, s2, 4
-; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3
-; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4
-; G_GFX10-NEXT: v_mov_b32_e32 v0, s3
-; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT: v_mov_b32_e32 v3, s1
+; G_GFX10-NEXT: s_add_i32 s4, s2, 4
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_GFX10-NEXT: s_lshl_b32 s5, s4, 3
+; G_GFX10-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s5
+; G_GFX10-NEXT: v_mov_b32_e32 v2, s0
; G_GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v3, s3
; G_GFX10-NEXT: ds_min_f32 v2, v1
-; G_GFX10-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX10-NEXT: ds_min_rtn_f32 v0, v3, v0
-; G_GFX10-NEXT: v_mov_b32_e32 v1, s0
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s2
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; G_GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; G_GFX10-NEXT: s_endpgm
;
; G_GFX11-LABEL: lds_ds_fmin:
; G_GFX11: ; %bb.0:
-; G_GFX11-NEXT: s_clause 0x1
; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; G_GFX11-NEXT: v_mov_b32_e32 v1, 0x42280000
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: s_add_i32 s2, s2, 4
-; G_GFX11-NEXT: v_mov_b32_e32 v3, s1
-; G_GFX11-NEXT: s_lshl_b32 s3, s2, 3
-; G_GFX11-NEXT: s_lshl_b32 s2, s2, 4
-; G_GFX11-NEXT: v_mov_b32_e32 v0, s3
-; G_GFX11-NEXT: v_mov_b32_e32 v2, s2
+; G_GFX11-NEXT: s_add_i32 s4, s2, 4
+; G_GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3
+; G_GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v0, s5
+; G_GFX11-NEXT: v_mov_b32_e32 v2, s0
; G_GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1
+; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX11-NEXT: v_mov_b32_e32 v3, s3
; G_GFX11-NEXT: ds_min_f32 v2, v1
-; G_GFX11-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX11-NEXT: ds_min_rtn_f32 v0, v3, v0
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: scratch_store_b32 off, v0, s0
+; G_GFX11-NEXT: scratch_store_b32 off, v0, s2
; G_GFX11-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
@@ -406,25 +405,25 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_addc_u32 s89, s89, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 3
; VI-NEXT: v_mov_b32_e32 v0, 0x42280000
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s0, s4, 3
+; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32
-; VI-NEXT: s_lshl_b32 s2, s2, 4
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: ds_max_f32 v2, v0 offset:64
-; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: ds_max_rtn_f32 v0, v0, v1
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; VI-NEXT: s_endpgm
@@ -485,20 +484,20 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
; GFX11-LABEL: lds_ds_fmax:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s3, s2, 3
+; GFX11-NEXT: s_lshl_b32 s0, s4, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32
; GFX11-NEXT: ds_max_f32 v2, v0 offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NEXT: ds_max_rtn_f32 v0, v3, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-NEXT: scratch_store_b32 off, v0, s2
; GFX11-NEXT: s_endpgm
;
; G_SI-LABEL: lds_ds_fmax:
@@ -561,26 +560,26 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
; G_VI: ; %bb.0:
; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; G_VI-NEXT: s_mov_b32 s90, -1
; G_VI-NEXT: s_mov_b32 s91, 0xe80000
; G_VI-NEXT: s_add_u32 s88, s88, s3
+; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; G_VI-NEXT: s_addc_u32 s89, s89, 0
-; G_VI-NEXT: s_waitcnt lgkmcnt(0)
-; G_VI-NEXT: s_add_i32 s2, s2, 4
-; G_VI-NEXT: s_lshl_b32 s3, s2, 3
; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000
-; G_VI-NEXT: v_mov_b32_e32 v1, s3
; G_VI-NEXT: s_mov_b32 m0, -1
+; G_VI-NEXT: s_waitcnt lgkmcnt(0)
+; G_VI-NEXT: s_add_i32 s4, s4, 4
+; G_VI-NEXT: s_lshl_b32 s0, s4, 3
+; G_VI-NEXT: v_mov_b32_e32 v1, s0
; G_VI-NEXT: ds_max_rtn_f32 v1, v1, v0
-; G_VI-NEXT: s_lshl_b32 s2, s2, 4
-; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: s_lshl_b32 s0, s4, 4
+; G_VI-NEXT: v_mov_b32_e32 v2, s0
; G_VI-NEXT: ds_max_f32 v2, v0
-; G_VI-NEXT: v_mov_b32_e32 v0, s1
+; G_VI-NEXT: v_mov_b32_e32 v0, s3
; G_VI-NEXT: s_waitcnt lgkmcnt(1)
; G_VI-NEXT: ds_max_rtn_f32 v0, v0, v1
-; G_VI-NEXT: v_mov_b32_e32 v1, s0
+; G_VI-NEXT: v_mov_b32_e32 v1, s2
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; G_VI-NEXT: s_endpgm
@@ -615,49 +614,48 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
; G_GFX10-LABEL: lds_ds_fmax:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
-; G_GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; G_GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; G_GFX10-NEXT: s_mov_b32 s6, -1
-; G_GFX10-NEXT: s_mov_b32 s7, 0x31c16000
-; G_GFX10-NEXT: s_add_u32 s4, s4, s3
-; G_GFX10-NEXT: s_addc_u32 s5, s5, 0
-; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; G_GFX10-NEXT: s_mov_b32 s10, -1
+; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000
+; G_GFX10-NEXT: s_add_u32 s8, s8, s3
+; G_GFX10-NEXT: s_addc_u32 s9, s9, 0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: s_add_i32 s2, s2, 4
-; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3
-; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4
-; G_GFX10-NEXT: v_mov_b32_e32 v0, s3
-; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT: v_mov_b32_e32 v3, s1
+; G_GFX10-NEXT: s_add_i32 s4, s2, 4
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_GFX10-NEXT: s_lshl_b32 s5, s4, 3
+; G_GFX10-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s5
+; G_GFX10-NEXT: v_mov_b32_e32 v2, s0
; G_GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v3, s3
; G_GFX10-NEXT: ds_max_f32 v2, v1
-; G_GFX10-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX10-NEXT: ds_max_rtn_f32 v0, v3, v0
-; G_GFX10-NEXT: v_mov_b32_e32 v1, s0
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s2
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; G_GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; G_GFX10-NEXT: s_endpgm
;
; G_GFX11-LABEL: lds_ds_fmax:
; G_GFX11: ; %bb.0:
-; G_GFX11-NEXT: s_clause 0x1
; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; G_GFX11-NEXT: v_mov_b32_e32 v1, 0x42280000
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: s_add_i32 s2, s2, 4
-; G_GFX11-NEXT: v_mov_b32_e32 v3, s1
-; G_GFX11-NEXT: s_lshl_b32 s3, s2, 3
-; G_GFX11-NEXT: s_lshl_b32 s2, s2, 4
-; G_GFX11-NEXT: v_mov_b32_e32 v0, s3
-; G_GFX11-NEXT: v_mov_b32_e32 v2, s2
+; G_GFX11-NEXT: s_add_i32 s4, s2, 4
+; G_GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3
+; G_GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v0, s5
+; G_GFX11-NEXT: v_mov_b32_e32 v2, s0
; G_GFX11-NEXT: ds_max_rtn_f32 v0, v0, v1
+; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX11-NEXT: v_mov_b32_e32 v3, s3
; G_GFX11-NEXT: ds_max_f32 v2, v1
-; G_GFX11-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX11-NEXT: ds_max_rtn_f32 v0, v3, v0
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: scratch_store_b32 off, v0, s0
+; G_GFX11-NEXT: scratch_store_b32 off, v0, s2
; G_GFX11-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
@@ -740,28 +738,28 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_addc_u32 s89, s89, 0
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 3
; VI-NEXT: v_mov_b32_e32 v1, 0x40450000
-; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s0, s4, 3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; VI-NEXT: s_lshl_b32 s2, s2, 4
-; VI-NEXT: v_mov_b32_e32 v5, s2
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: ds_min_f64 v5, v[0:1] offset:64
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3]
-; VI-NEXT: s_add_i32 s1, s0, 4
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_add_i32 s0, s2, 4
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -827,22 +825,22 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
; GFX11-LABEL: lds_ds_fmin_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s3, s2, 3
-; GFX11-NEXT: v_mov_b32_e32 v5, s1
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4
+; GFX11-NEXT: s_lshl_b32 s0, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v5, s3
+; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_lshl_b32 s0, s4, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s2
+; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
; GFX11-NEXT: ds_min_f64 v4, v[0:1] offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: scratch_store_b64 off, v[0:1], s0
+; GFX11-NEXT: scratch_store_b64 off, v[0:1], s2
; GFX11-NEXT: s_endpgm
;
; G_SI-LABEL: lds_ds_fmin_f64:
@@ -917,30 +915,30 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
; G_VI: ; %bb.0:
; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; G_VI-NEXT: s_mov_b32 s90, -1
; G_VI-NEXT: s_mov_b32 s91, 0xe80000
; G_VI-NEXT: s_add_u32 s88, s88, s3
-; G_VI-NEXT: s_mov_b32 s2, 0
+; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_VI-NEXT: s_mov_b32 s0, 0
; G_VI-NEXT: s_addc_u32 s89, s89, 0
-; G_VI-NEXT: s_mov_b32 s3, 0x40450000
+; G_VI-NEXT: s_mov_b32 s1, 0x40450000
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: s_add_i32 s4, s4, 4
-; G_VI-NEXT: v_mov_b32_e32 v0, s2
-; G_VI-NEXT: s_lshl_b32 s2, s4, 3
-; G_VI-NEXT: v_mov_b32_e32 v1, s3
-; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: v_mov_b32_e32 v0, s0
+; G_VI-NEXT: s_lshl_b32 s0, s4, 3
+; G_VI-NEXT: v_mov_b32_e32 v1, s1
+; G_VI-NEXT: v_mov_b32_e32 v2, s0
; G_VI-NEXT: s_mov_b32 m0, -1
; G_VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1]
-; G_VI-NEXT: s_lshl_b32 s2, s4, 4
-; G_VI-NEXT: v_mov_b32_e32 v4, s2
+; G_VI-NEXT: s_lshl_b32 s0, s4, 4
+; G_VI-NEXT: v_mov_b32_e32 v4, s0
; G_VI-NEXT: ds_min_f64 v4, v[0:1]
-; G_VI-NEXT: v_mov_b32_e32 v0, s1
+; G_VI-NEXT: v_mov_b32_e32 v0, s3
; G_VI-NEXT: s_waitcnt lgkmcnt(1)
; G_VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3]
-; G_VI-NEXT: v_mov_b32_e32 v2, s0
-; G_VI-NEXT: s_add_u32 s0, s0, 4
+; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: s_add_u32 s0, s2, 4
; G_VI-NEXT: v_mov_b32_e32 v3, s0
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -1013,24 +1011,25 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
; G_GFX11-LABEL: lds_ds_fmin_f64:
; G_GFX11: ; %bb.0:
; G_GFX11-NEXT: s_clause 0x1
-; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; G_GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; G_GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; G_GFX11-NEXT: s_mov_b32 s0, 0
+; G_GFX11-NEXT: s_mov_b32 s1, 0x40450000
+; G_GFX11-NEXT: v_mov_b32_e32 v0, s0
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: s_add_i32 s4, s2, 4
-; G_GFX11-NEXT: s_mov_b32 s2, 0
-; G_GFX11-NEXT: s_mov_b32 s3, 0x40450000
+; G_GFX11-NEXT: s_add_i32 s4, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v5, s3
; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3
-; G_GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1
-; G_GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5
-; G_GFX11-NEXT: s_lshl_b32 s2, s4, 4
-; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; G_GFX11-NEXT: v_mov_b32_e32 v4, s2
+; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; G_GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s5
+; G_GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v4, s0
; G_GFX11-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1]
; G_GFX11-NEXT: ds_min_f64 v4, v[0:1]
; G_GFX11-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX11-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3]
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s0
+; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s2
; G_GFX11-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
@@ -1113,28 +1112,28 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_addc_u32 s89, s89, 0
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 3
; VI-NEXT: v_mov_b32_e32 v1, 0x40450000
-; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s0, s4, 3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; VI-NEXT: s_lshl_b32 s2, s2, 4
-; VI-NEXT: v_mov_b32_e32 v5, s2
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: ds_max_f64 v5, v[0:1] offset:64
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3]
-; VI-NEXT: s_add_i32 s1, s0, 4
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_add_i32 s0, s2, 4
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -1200,22 +1199,22 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
; GFX11-LABEL: lds_ds_fmax_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s3, s2, 3
-; GFX11-NEXT: v_mov_b32_e32 v5, s1
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4
+; GFX11-NEXT: s_lshl_b32 s0, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v5, s3
+; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_lshl_b32 s0, s4, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s2
+; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
; GFX11-NEXT: ds_max_f64 v4, v[0:1] offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: scratch_store_b64 off, v[0:1], s0
+; GFX11-NEXT: scratch_store_b64 off, v[0:1], s2
; GFX11-NEXT: s_endpgm
;
; G_SI-LABEL: lds_ds_fmax_f64:
@@ -1290,30 +1289,30 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
; G_VI: ; %bb.0:
; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; G_VI-NEXT: s_mov_b32 s90, -1
; G_VI-NEXT: s_mov_b32 s91, 0xe80000
; G_VI-NEXT: s_add_u32 s88, s88, s3
-; G_VI-NEXT: s_mov_b32 s2, 0
+; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_VI-NEXT: s_mov_b32 s0, 0
; G_VI-NEXT: s_addc_u32 s89, s89, 0
-; G_VI-NEXT: s_mov_b32 s3, 0x40450000
+; G_VI-NEXT: s_mov_b32 s1, 0x40450000
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: s_add_i32 s4, s4, 4
-; G_VI-NEXT: v_mov_b32_e32 v0, s2
-; G_VI-NEXT: s_lshl_b32 s2, s4, 3
-; G_VI-NEXT: v_mov_b32_e32 v1, s3
-; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: v_mov_b32_e32 v0, s0
+; G_VI-NEXT: s_lshl_b32 s0, s4, 3
+; G_VI-NEXT: v_mov_b32_e32 v1, s1
+; G_VI-NEXT: v_mov_b32_e32 v2, s0
; G_VI-NEXT: s_mov_b32 m0, -1
; G_VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1]
-; G_VI-NEXT: s_lshl_b32 s2, s4, 4
-; G_VI-NEXT: v_mov_b32_e32 v4, s2
+; G_VI-NEXT: s_lshl_b32 s0, s4, 4
+; G_VI-NEXT: v_mov_b32_e32 v4, s0
; G_VI-NEXT: ds_max_f64 v4, v[0:1]
-; G_VI-NEXT: v_mov_b32_e32 v0, s1
+; G_VI-NEXT: v_mov_b32_e32 v0, s3
; G_VI-NEXT: s_waitcnt lgkmcnt(1)
; G_VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3]
-; G_VI-NEXT: v_mov_b32_e32 v2, s0
-; G_VI-NEXT: s_add_u32 s0, s0, 4
+; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: s_add_u32 s0, s2, 4
; G_VI-NEXT: v_mov_b32_e32 v3, s0
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -1386,24 +1385,25 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
; G_GFX11-LABEL: lds_ds_fmax_f64:
; G_GFX11: ; %bb.0:
; G_GFX11-NEXT: s_clause 0x1
-; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; G_GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; G_GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; G_GFX11-NEXT: s_mov_b32 s0, 0
+; G_GFX11-NEXT: s_mov_b32 s1, 0x40450000
+; G_GFX11-NEXT: v_mov_b32_e32 v0, s0
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: s_add_i32 s4, s2, 4
-; G_GFX11-NEXT: s_mov_b32 s2, 0
-; G_GFX11-NEXT: s_mov_b32 s3, 0x40450000
+; G_GFX11-NEXT: s_add_i32 s4, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v5, s3
; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3
-; G_GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1
-; G_GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5
-; G_GFX11-NEXT: s_lshl_b32 s2, s4, 4
-; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; G_GFX11-NEXT: v_mov_b32_e32 v4, s2
+; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; G_GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s5
+; G_GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v4, s0
; G_GFX11-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1]
; G_GFX11-NEXT: ds_max_f64 v4, v[0:1]
; G_GFX11-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX11-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3]
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s0
+; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s2
; G_GFX11-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
index e1124f3..90623c0 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p
; GCN: liveins: $sgpr0_sgpr1
; GCN: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; GFX6: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0
- ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0
; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 01a1ab4..2c3e3fa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -78,12 +78,12 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out,
;
; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -111,12 +111,12 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out,
; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s4, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -166,14 +166,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_cvt_pkrtz_v2f16_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -219,12 +219,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -262,16 +262,16 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out,
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -279,35 +279,35 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -340,16 +340,16 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out,
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -357,35 +357,35 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, 1.0, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -423,14 +423,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out,
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -476,12 +476,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -525,14 +525,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out,
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -578,12 +578,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, -v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -627,14 +627,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -680,12 +680,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, -v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -730,14 +730,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -783,12 +783,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -|v1|, -v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
index 224de95..edd88da 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -549,8 +549,8 @@ end:
; GCN-LABEL: {{^}}test_export_clustering:
; PREGFX11-DAG: v_mov_b32_e32 [[W0:v[0-9]+]], 0
; PREGFX11-DAG: v_mov_b32_e32 [[W1:v[0-9]+]], 1.0
-; PREGFX11-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s0
-; PREGFX11-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s1
+; PREGFX11-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s2
+; PREGFX11-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s3
; PREGFX11-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]]
; PREGFX11-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]]
; PREGFX11: exp param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index a737c5e..0567b42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -126,20 +126,20 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
;
; GISEL-GFX11-LABEL: v_fcmp_f32:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f32:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
+; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1)
store i32 %result, ptr addrspace(1) %out
@@ -150,13 +150,13 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_oeq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -176,14 +176,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_oeq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -208,13 +208,13 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_one:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -234,14 +234,14 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_one:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -266,13 +266,13 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ogt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -292,14 +292,14 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ogt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -324,13 +324,13 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_oge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -350,14 +350,14 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_oge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -382,13 +382,13 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_olt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -408,14 +408,14 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_olt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -440,13 +440,13 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ole:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -466,14 +466,14 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ole:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -498,13 +498,13 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_o:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -524,14 +524,14 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_o:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -556,13 +556,13 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_uo:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -582,14 +582,14 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_uo:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -614,13 +614,13 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ueq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -640,14 +640,14 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ueq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -672,13 +672,13 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_une:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -698,14 +698,14 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_une:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -730,13 +730,13 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -756,14 +756,14 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ugt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -788,13 +788,13 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -814,14 +814,14 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_uge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -846,13 +846,13 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -872,14 +872,14 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ult:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -904,13 +904,13 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -930,14 +930,14 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ule:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -961,47 +961,47 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_oeq:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_oeq:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_oeq:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_oeq:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
store i32 %result, ptr addrspace(1) %out
@@ -1011,47 +1011,47 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_one:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_one:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_one:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_one:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
store i32 %result, ptr addrspace(1) %out
@@ -1061,47 +1061,47 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ogt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ogt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ogt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ogt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
store i32 %result, ptr addrspace(1) %out
@@ -1111,47 +1111,47 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_oge:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_oge:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_oge:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_oge:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
store i32 %result, ptr addrspace(1) %out
@@ -1161,47 +1161,47 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_olt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_olt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_olt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_olt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
store i32 %result, ptr addrspace(1) %out
@@ -1211,47 +1211,47 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ole:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ole:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ole:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ole:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
store i32 %result, ptr addrspace(1) %out
@@ -1261,47 +1261,47 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ueq:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ueq:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ueq:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ueq:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
store i32 %result, ptr addrspace(1) %out
@@ -1311,47 +1311,47 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_o:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_o:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_o:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_o:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 7)
store i32 %result, ptr addrspace(1) %out
@@ -1361,47 +1361,47 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_uo:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_uo:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_uo:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_uo:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 8)
store i32 %result, ptr addrspace(1) %out
@@ -1411,47 +1411,47 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_une:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_une:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_une:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_une:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
store i32 %result, ptr addrspace(1) %out
@@ -1461,47 +1461,47 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ugt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ugt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ugt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ugt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
store i32 %result, ptr addrspace(1) %out
@@ -1511,47 +1511,47 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_uge:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_uge:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_uge:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_uge:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
store i32 %result, ptr addrspace(1) %out
@@ -1561,47 +1561,47 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ult:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ult:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ult:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ult:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
store i32 %result, ptr addrspace(1) %out
@@ -1611,47 +1611,47 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ule:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ule:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ule:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ule:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
store i32 %result, ptr addrspace(1) %out
@@ -1663,14 +1663,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; SDAG-GFX11-NEXT: s_lshr_b32 s0, s4, 16
; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3|
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0|
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1691,15 +1691,15 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GISEL-GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3|
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0|
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1727,14 +1727,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; SDAG-GFX11-NEXT: s_lshr_b32 s0, s4, 16
; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3|
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0|
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1755,15 +1755,15 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GISEL-GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3|
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0|
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1798,20 +1798,20 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
;
; GISEL-GFX11-LABEL: v_fcmp_f16:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f16:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
+; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 -1)
store i32 %result, ptr addrspace(1) %out
@@ -1823,13 +1823,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_oeq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1849,14 +1849,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_oeq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1882,13 +1882,13 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_one:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1908,14 +1908,14 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_one:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1941,13 +1941,13 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ogt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1967,14 +1967,14 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ogt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2000,13 +2000,13 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_oge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2026,14 +2026,14 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_oge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2059,13 +2059,13 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_olt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2085,14 +2085,14 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_olt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2118,13 +2118,13 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ole:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2144,14 +2144,14 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ole:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2177,13 +2177,13 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ueq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2203,14 +2203,14 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ueq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2236,13 +2236,13 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_une:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2262,14 +2262,14 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_une:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2295,13 +2295,13 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2321,14 +2321,14 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ugt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2354,13 +2354,13 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2380,14 +2380,14 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_uge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2413,13 +2413,13 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2439,14 +2439,14 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ult:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2471,13 +2471,13 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_o:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2497,14 +2497,14 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_o:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2529,13 +2529,13 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_uo:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2555,14 +2555,14 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_uo:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2587,13 +2587,13 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2613,14 +2613,14 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ule:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index 7d41cf1..62a007e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -137,10 +137,10 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
;
; GFX11-GISEL-LABEL: v_fcmp_f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -151,10 +151,10 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
;
; GFX9-GISEL-LABEL: v_fcmp_f32:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f32:
@@ -163,10 +163,10 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
;
; VI-GISEL-LABEL: v_fcmp_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1)
@@ -178,15 +178,15 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_oeq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_eq_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -206,29 +206,29 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_oeq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_oeq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1)
@@ -240,15 +240,15 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_one:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_neq_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -268,29 +268,29 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_one:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_one:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6)
@@ -302,15 +302,15 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ogt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_lt_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -330,29 +330,29 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ogt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ogt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2)
@@ -364,15 +364,15 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_oge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_le_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -392,29 +392,29 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_oge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_oge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3)
@@ -426,15 +426,15 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_olt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_gt_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -454,29 +454,29 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_olt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_olt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4)
@@ -488,15 +488,15 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ole:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_ge_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -516,29 +516,29 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ole:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ole:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5)
@@ -550,15 +550,15 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_o:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_o_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -578,29 +578,29 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_o:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_o:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 7)
@@ -612,15 +612,15 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_uo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_u_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -640,29 +640,29 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_uo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_uo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 8)
@@ -674,15 +674,15 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ueq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -702,29 +702,29 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ueq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ueq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9)
@@ -736,15 +736,15 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_une:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_neq_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -764,29 +764,29 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_une:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_une:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14)
@@ -798,15 +798,15 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_nge_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -826,29 +826,29 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ugt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ugt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10)
@@ -860,15 +860,15 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_ngt_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -888,29 +888,29 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_uge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_uge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11)
@@ -922,15 +922,15 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -950,29 +950,29 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ult:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ult:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12)
@@ -984,15 +984,15 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_nlt_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1012,29 +1012,29 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ule:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ule:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13)
@@ -1045,56 +1045,56 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_oeq:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_eq_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_oeq:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_oeq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_oeq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
@@ -1105,56 +1105,56 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_one:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_neq_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_one:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_one:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_one:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
@@ -1165,56 +1165,56 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ogt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_lt_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ogt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ogt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ogt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
@@ -1225,56 +1225,56 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_oge:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_le_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_oge:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_oge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_oge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
@@ -1285,56 +1285,56 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_olt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_gt_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_olt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_olt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_olt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
@@ -1345,56 +1345,56 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ole:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_ge_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ole:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ole:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ole:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
@@ -1405,56 +1405,56 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ueq:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_nlg_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ueq:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ueq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ueq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
@@ -1465,56 +1465,56 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_o:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_o_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_o:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_o:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_o:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 7)
@@ -1525,56 +1525,56 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_uo:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_u_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_u_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_uo:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_uo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_uo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 8)
@@ -1585,56 +1585,56 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_une:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_neq_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_une:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_une:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_une:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
@@ -1645,56 +1645,56 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ugt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nge_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_nge_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ugt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ugt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ugt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
@@ -1705,56 +1705,56 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_uge:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ngt_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_ngt_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_uge:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_uge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_uge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
@@ -1765,56 +1765,56 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ult:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nle_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_nle_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ult:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ult:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ult:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
@@ -1825,56 +1825,56 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ule:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlt_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_nlt_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ule:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ule:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ule:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
@@ -1887,17 +1887,17 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
; GFX11-LABEL: v_fcmp_f16_oeq_with_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |s3|
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |s0|
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1918,31 +1918,31 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
;
; VI-SDAG-LABEL: v_fcmp_f16_oeq_with_fabs:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
-; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0|
+; VI-SDAG-NEXT: s_lshr_b32 s0, s4, 16
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |v0|
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_oeq_with_fabs:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3
-; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0|
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |v0|
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%temp = call half @llvm.fabs.f16(half %a)
@@ -1956,17 +1956,17 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
; GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |s3|
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |s0|
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1987,31 +1987,31 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
;
; VI-SDAG-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
-; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0|
+; VI-SDAG-NEXT: s_lshr_b32 s0, s4, 16
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |v0|
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3
-; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0|
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |v0|
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%temp = call half @llvm.fabs.f16(half %a)
@@ -2028,10 +2028,10 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
;
; GFX11-GISEL-LABEL: v_fcmp_f16:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -2042,10 +2042,10 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
;
; GFX9-GISEL-LABEL: v_fcmp_f16:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f16:
@@ -2054,10 +2054,10 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
;
; VI-GISEL-LABEL: v_fcmp_f16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 -1)
@@ -2070,15 +2070,15 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_oeq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2098,29 +2098,29 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_oeq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_oeq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 1)
@@ -2133,15 +2133,15 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_one:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_neq_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2161,29 +2161,29 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_one:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_one:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 6)
@@ -2196,15 +2196,15 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ogt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_lt_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2224,29 +2224,29 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ogt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ogt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 2)
@@ -2259,15 +2259,15 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_oge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_le_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2287,29 +2287,29 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_oge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_oge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 3)
@@ -2322,15 +2322,15 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_olt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_gt_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2350,29 +2350,29 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_olt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_olt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 4)
@@ -2385,15 +2385,15 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ole:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_ge_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2413,29 +2413,29 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ole:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ole:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 5)
@@ -2448,15 +2448,15 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ueq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_nlg_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2476,29 +2476,29 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ueq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ueq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 9)
@@ -2511,15 +2511,15 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_une:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_neq_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2539,29 +2539,29 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_une:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_une:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 14)
@@ -2574,15 +2574,15 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_nge_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2602,29 +2602,29 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ugt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ugt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 10)
@@ -2637,15 +2637,15 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_ngt_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2665,29 +2665,29 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_uge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_uge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 11)
@@ -2700,15 +2700,15 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_nle_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2728,29 +2728,29 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ult:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ult:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 12)
@@ -2762,15 +2762,15 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_o:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_o_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2790,29 +2790,29 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_o:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_o:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 7)
@@ -2824,15 +2824,15 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_uo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_u_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2852,29 +2852,29 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_uo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_uo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 8)
@@ -2886,15 +2886,15 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_nlt_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2914,29 +2914,29 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ule:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ule:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 13)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index ca06a57b..528d289e1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -8,15 +8,15 @@ declare bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> %a, <2 x bfloat> %b, bf
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-NEXT: global_load_u16 v1, v0, s[10:11]
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[8:9], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dot2_bf16_bf16 v1, s0, s1, v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -34,18 +34,17 @@ entry:
}
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp(
-; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
-; SDAG-GFX11: ; %bb.0: ; %entry
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2
-; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3
-; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s1
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s0
-; SDAG-GFX11-NEXT: s_endpgm
-;
+; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v0, off, s6
+; GFX11-NEXT: scratch_load_u16 v1, off, s7
+; GFX11-NEXT: scratch_load_b32 v2, off, s5
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX11-NEXT: scratch_store_b16 off, v0, s4
+; GFX11-NEXT: s_endpgm
; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
; GISEL-GFX11: ; %bb.0: ; %entry
; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -95,3 +94,5 @@ entry:
}
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SDAG-GFX11: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 99c3dea..7edf3d6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -7,15 +7,15 @@ declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c)
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-NEXT: global_load_u16 v1, v0, s[10:11]
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[8:9], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dot2_f16_f16 v1, s0, s1, v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -35,26 +35,26 @@ entry:
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp(
; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
; SDAG-GFX11: ; %bb.0: ; %entry
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2
-; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3
-; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s1
+; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s6
+; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s7
+; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s5
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0)
; SDAG-GFX11-NEXT: v_dot2_f16_f16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s0
+; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s4
; SDAG-GFX11-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
; GISEL-GFX11: ; %bb.0: ; %entry
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s1
-; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s2
-; GISEL-GFX11-NEXT: scratch_load_u16 v2, off, s3
+; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s5
+; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s6
+; GISEL-GFX11-NEXT: scratch_load_u16 v2, off, s7
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX11-NEXT: v_dot2_f16_f16_e64_dpp v0, v0, v1, v2 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GISEL-GFX11-NEXT: scratch_store_b16 off, v0, s0
+; GISEL-GFX11-NEXT: scratch_store_b16 off, v0, s4
; GISEL-GFX11-NEXT: s_endpgm
ptr addrspace(5) %r,
ptr addrspace(5) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index e51b1d2..40c6925 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -7,16 +7,16 @@ declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, floa
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[10:11], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s2, s[8:9], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_dot2_f32_bf16 v0, s1, s2, v0 clamp
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -37,16 +37,16 @@ entry:
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[10:11], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s2, s[8:9], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_dot2_f32_bf16 v0, s1, s2, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
index 434fa1b..690362c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
@@ -7,20 +7,20 @@ declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64)
define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) %addr, i64 %in) {
; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:-32 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_no_rtn:
; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:-32 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 -4
@@ -31,14 +31,15 @@ entry:
define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %addr, i64 %in, ptr addrspace(1) %use) {
; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_rtn:
; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -47,13 +48,13 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index f6197e0..c2eb771 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -8,12 +8,12 @@ declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-LABEL: global_load_tr_b64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32
+; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[4:5] offset:32
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -27,12 +27,12 @@ entry:
define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-LABEL: global_load_tr_b128:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
+; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[4:5] offset:32
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index a2dc366..96835c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -8,12 +8,12 @@ declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-LABEL: global_load_tr_b64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32
+; GFX12-NEXT: global_load_tr_b64 v1, v0, s[4:5] offset:32
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX12-NEXT: global_store_b32 v0, v1, s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -27,12 +27,12 @@ entry:
define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-LABEL: global_load_tr_b128:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
+; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[4:5] offset:32
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index ae61b58..1e1ea10 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -22,13 +22,13 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_eq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -48,14 +48,14 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_eq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -87,20 +87,20 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-GFX11-LABEL: v_icmp_i32:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i32:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
+; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30)
store i32 %result, ptr addrspace(1) %out
@@ -111,13 +111,13 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ne:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -137,14 +137,14 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_ne:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -169,13 +169,13 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -195,14 +195,14 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_ugt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -227,13 +227,13 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -253,14 +253,14 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_uge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -285,13 +285,13 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -311,14 +311,14 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_ult:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -343,13 +343,13 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -369,14 +369,14 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_ule:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -401,13 +401,13 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; SDAG-GFX11-LABEL: v_icmp_i32_sgt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -427,14 +427,14 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; GISEL-GFX11-LABEL: v_icmp_i32_sgt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -459,13 +459,13 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_sge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -485,14 +485,14 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_sge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -517,13 +517,13 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_slt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -543,14 +543,14 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_slt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -575,13 +575,13 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_sle:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -601,14 +601,14 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_sle:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -632,47 +632,47 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_eq:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_eq:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_eq:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_eq:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
store i32 %result, ptr addrspace(1) %out
@@ -682,47 +682,47 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_ne:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_ne:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_ne:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_ne:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
store i32 %result, ptr addrspace(1) %out
@@ -732,47 +732,47 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_ugt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_u64_ugt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_u64_ugt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_u64_ugt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
store i32 %result, ptr addrspace(1) %out
@@ -782,47 +782,47 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_uge:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_u64_uge:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_u64_uge:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_u64_uge:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
store i32 %result, ptr addrspace(1) %out
@@ -832,47 +832,47 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_ult:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_u64_ult:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_u64_ult:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_u64_ult:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
store i32 %result, ptr addrspace(1) %out
@@ -882,47 +882,47 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_ule:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_u64_ule:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_u64_ule:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_u64_ule:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
store i32 %result, ptr addrspace(1) %out
@@ -932,47 +932,47 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_sgt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_sgt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_sgt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_sgt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
store i32 %result, ptr addrspace(1) %out
@@ -982,47 +982,47 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_sge:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_sge:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_sge:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_sge:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
store i32 %result, ptr addrspace(1) %out
@@ -1032,47 +1032,47 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_slt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_slt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_slt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_slt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
store i32 %result, ptr addrspace(1) %out
@@ -1082,47 +1082,47 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_sle:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_sle:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_sle:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_sle:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
store i32 %result, ptr addrspace(1) %out
@@ -1133,13 +1133,13 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_eq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1159,14 +1159,14 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_eq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1198,20 +1198,20 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-GFX11-LABEL: v_icmp_i16:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i16:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
+; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 30)
store i32 %result, ptr addrspace(1) %out
@@ -1222,13 +1222,13 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ne:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1248,14 +1248,14 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_ne:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1280,13 +1280,13 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1306,14 +1306,14 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_ugt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1338,13 +1338,13 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1364,14 +1364,14 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_uge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1396,13 +1396,13 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1422,14 +1422,14 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_ult:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1454,13 +1454,13 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1480,14 +1480,14 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_ule:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1512,13 +1512,13 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; SDAG-GFX11-LABEL: v_icmp_i16_sgt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1538,14 +1538,14 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; GISEL-GFX11-LABEL: v_icmp_i16_sgt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1570,13 +1570,13 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_sge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1596,14 +1596,14 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_sge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1628,13 +1628,13 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_slt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1654,14 +1654,14 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_slt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1686,13 +1686,13 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_sle:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1712,14 +1712,14 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_sle:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index 54931ac..ae285c8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -25,30 +25,30 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_eq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_eq:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -67,15 +67,15 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_eq:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32)
@@ -98,29 +98,29 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-GFX11-LABEL: v_icmp_i32:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i32:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GISEL-VI-NEXT: s_endpgm
;
; GISEL-GFX9-LABEL: v_icmp_i32:
; GISEL-GFX9: ; %bb.0:
-; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30)
store i64 %result, ptr addrspace(1) %out
@@ -131,30 +131,30 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ne:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ne_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_ne:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -173,15 +173,15 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_ne:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33)
@@ -193,30 +193,30 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_lt_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_ugt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -235,15 +235,15 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_ugt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34)
@@ -255,30 +255,30 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_le_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_uge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -297,15 +297,15 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_uge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35)
@@ -317,30 +317,30 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_gt_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_ult:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -359,15 +359,15 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_ult:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36)
@@ -379,30 +379,30 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ge_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_ule:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -421,15 +421,15 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_ule:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37)
@@ -441,30 +441,30 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; GFX11-LABEL: v_icmp_i32_sgt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_lt_i32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_sgt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -483,15 +483,15 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
;
; GISEL-VI-LABEL: v_icmp_i32_sgt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38)
@@ -503,30 +503,30 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_sge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_le_i32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_sge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -545,15 +545,15 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_sge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39)
@@ -565,30 +565,30 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_slt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_gt_i32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_slt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -607,15 +607,15 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_slt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40)
@@ -627,30 +627,30 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_sle:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ge_i32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_sle:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -669,15 +669,15 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_sle:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41)
@@ -688,56 +688,56 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_eq:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_eq:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_eq:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_eq:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
@@ -748,56 +748,56 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_ne:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_ne_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_ne:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_ne:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_ne:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
@@ -808,56 +808,56 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_ugt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_lt_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_u64_ugt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_u64_ugt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_u64_ugt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
@@ -868,56 +868,56 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_uge:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_le_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_u64_uge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_u64_uge:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_u64_uge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
@@ -928,56 +928,56 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_ult:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_gt_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_u64_ult:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_u64_ult:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_u64_ult:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
@@ -988,56 +988,56 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_ule:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_ge_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_u64_ule:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_u64_ule:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_u64_ule:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
@@ -1048,56 +1048,56 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_sgt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_i64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_lt_i64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_sgt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_sgt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_sgt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
@@ -1108,56 +1108,56 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_sge:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_i64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_le_i64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_sge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_sge:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_sge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
@@ -1168,56 +1168,56 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_slt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_i64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_gt_i64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_slt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_slt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_slt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
@@ -1228,56 +1228,56 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_sle:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_i64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_ge_i64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_sle:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_sle:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_sle:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
@@ -1289,30 +1289,30 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_eq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_eq_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_eq:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1331,15 +1331,15 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_eq:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 32)
@@ -1362,29 +1362,29 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-GFX11-LABEL: v_icmp_i16:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i16:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GISEL-VI-NEXT: s_endpgm
;
; GISEL-GFX9-LABEL: v_icmp_i16:
; GISEL-GFX9: ; %bb.0:
-; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 30)
store i64 %result, ptr addrspace(1) %out
@@ -1395,30 +1395,30 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ne:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ne_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_ne:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1437,15 +1437,15 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_ne:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 33)
@@ -1457,30 +1457,30 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_lt_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_ugt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1499,15 +1499,15 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_ugt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 34)
@@ -1519,30 +1519,30 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_le_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_uge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1561,15 +1561,15 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_uge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 35)
@@ -1581,30 +1581,30 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_gt_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_ult:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1623,15 +1623,15 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_ult:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 36)
@@ -1643,30 +1643,30 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ge_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_ule:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1685,15 +1685,15 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_ule:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 37)
@@ -1705,30 +1705,30 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; GFX11-LABEL: v_icmp_i16_sgt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_lt_i16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_sgt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1747,15 +1747,15 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
;
; GISEL-VI-LABEL: v_icmp_i16_sgt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 38)
@@ -1767,30 +1767,30 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_sge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_le_i16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_sge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1809,15 +1809,15 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_sge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 39)
@@ -1829,30 +1829,30 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_slt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_gt_i16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_slt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1871,15 +1871,15 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_slt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 40)
@@ -1891,30 +1891,30 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_sle:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ge_i16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_sle:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1933,15 +1933,15 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_sle:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 41)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 3a77b3b..cffd9a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -349,9 +349,10 @@ main_body:
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
-; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1013-NEXT: s_clause 0x1
+; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT: v_mov_b32_e32 v3, 0
; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
@@ -362,22 +363,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_add_co_u32 v0, s4, s4, v0
-; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
+; GFX1013-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
;
; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1030: ; %bb.0: ; %main_body
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1030-NEXT: s_clause 0x1
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000
; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
@@ -388,36 +390,37 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0
-; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
+; GFX1030-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000
; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0
; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7
; GFX11-NEXT: v_dual_mov_b32 v10, 0x102 :: v_dual_mov_b32 v7, 1.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3]
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[4:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
@@ -442,9 +445,10 @@ main_body:
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
-; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1013-NEXT: s_clause 0x1
+; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT: v_mov_b32_e32 v3, 0
; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
@@ -452,22 +456,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_add_co_u32 v0, s4, s4, v0
-; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
+; GFX1013-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
;
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1030: ; %bb.0: ; %main_body
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1030-NEXT: s_clause 0x1
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
@@ -475,34 +480,35 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0
-; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
+; GFX1030-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0
; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 0x102
; GFX11-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200
; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[4:7] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 634159a..c9bdc70 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -72,13 +72,13 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0
; GFX11-LABEL: v_permlane16_b32_vii:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -117,14 +117,14 @@ define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0
; GFX11-LABEL: v_permlane16_b32_vll:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_movk_i32 s0, 0x1234
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_movk_i32 s2, 0x1234
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -165,19 +165,19 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
; GFX11-SDAG-LABEL: v_permlane16_b32_vvv:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -185,18 +185,18 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
; GFX11-GISEL-LABEL: v_permlane16_b32_vvv:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -596,13 +596,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src
; GFX11-LABEL: v_permlanex16_b32_vii:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -641,14 +641,14 @@ define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src
; GFX11-LABEL: v_permlanex16_b32_vll:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_movk_i32 s0, 0x1234
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_movk_i32 s2, 0x1234
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -689,19 +689,19 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -709,18 +709,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1075,11 +1075,11 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1088,11 +1088,11 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1118,11 +1118,11 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1131,11 +1131,11 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1175,12 +1175,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
-; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -1189,13 +1189,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1204,12 +1204,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
-; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[4:5]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -1218,13 +1218,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -1250,11 +1250,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1263,11 +1263,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1294,11 +1294,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1307,11 +1307,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1338,11 +1338,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1351,11 +1351,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1382,11 +1382,11 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1395,11 +1395,11 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1425,11 +1425,11 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1438,11 +1438,11 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1482,12 +1482,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
-; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -1496,13 +1496,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1511,12 +1511,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
-; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[4:5]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -1525,13 +1525,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -1557,11 +1557,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1570,11 +1570,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1601,11 +1601,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1614,11 +1614,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1645,11 +1645,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1658,11 +1658,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out,
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
index 77a975f..2cc49c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
@@ -445,13 +445,13 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3
; GFX12-SDAG-LABEL: v_permlane16var_b32_tid_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -459,14 +459,14 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3
; GFX12-GISEL-LABEL: v_permlane16var_b32_tid_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -480,13 +480,13 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out,
; GFX12-SDAG-LABEL: v_permlane16var_b32_undef_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -494,14 +494,14 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out,
; GFX12-GISEL-LABEL: v_permlane16var_b32_undef_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -516,14 +516,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32
; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v0, v2
-; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -531,14 +531,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32
; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v0, v2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -552,13 +552,13 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i
; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -566,14 +566,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i
; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -588,13 +588,13 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i
; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_bc:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -602,14 +602,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i
; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_bc:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -624,13 +624,13 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out
; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi_bc:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -638,14 +638,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out
; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi_bc:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -660,13 +660,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i
; GFX12-SDAG-LABEL: v_permlanex16var_b32_tid_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -674,14 +674,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i
; GFX12-GISEL-LABEL: v_permlanex16var_b32_tid_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -695,13 +695,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out,
; GFX12-SDAG-LABEL: v_permlanex16var_b32_undef_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -709,14 +709,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out,
; GFX12-GISEL-LABEL: v_permlanex16var_b32_undef_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -731,14 +731,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32
; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v0, v2
-; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -746,14 +746,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32
; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v0, v2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -767,13 +767,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out,
; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -781,14 +781,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out,
; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -803,13 +803,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out,
; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_bc:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -817,14 +817,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out,
; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_bc:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -839,13 +839,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou
; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi_bc:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -853,14 +853,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou
; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi_bc:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index b81cb97..84edbb8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -9,13 +9,13 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
; GFX11-LABEL: test_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane64_b32 v0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -27,12 +27,12 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
; GFX11-LABEL: test_i:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -44,22 +44,22 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
; GFX11-SDAG-LABEL: test_v:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_v:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
index cb511c9..bf3d0a5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
@@ -8,11 +8,11 @@
define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dword s2, s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x:
@@ -37,11 +37,11 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat
; GFX11-PACKED-LABEL: tbuffer_store_d16_x:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -53,14 +53,14 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dword s2, s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s0, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s1, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy:
@@ -85,11 +85,11 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal
; GFX11-PACKED-LABEL: tbuffer_store_d16_xy:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -101,16 +101,16 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s0, s3, 0xffff
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s1, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s2, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
@@ -139,13 +139,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: s_and_b32 s0, s3, 0xffff
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -158,18 +158,18 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s0, s3, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s1, s3, 0xffff
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s3, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s2, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s3
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
@@ -196,12 +196,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
index 01df7634..2be7ec2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
@@ -10,11 +10,11 @@
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dword s2, s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x:
@@ -39,11 +39,11 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
; GFX11-PACKED-LABEL: tbuffer_store_d16_x:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -51,11 +51,11 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
; GFX12-PACKED-LABEL: tbuffer_store_d16_x:
; GFX12-PACKED: ; %bb.0: ; %main_body
; GFX12-PACKED-NEXT: s_clause 0x1
-; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX12-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-NEXT: s_nop 0
; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-NEXT: s_endpgm
@@ -67,14 +67,14 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dword s2, s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s0, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s1, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy:
@@ -99,11 +99,11 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat
; GFX11-PACKED-LABEL: tbuffer_store_d16_xy:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -111,11 +111,11 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat
; GFX12-PACKED-LABEL: tbuffer_store_d16_xy:
; GFX12-PACKED: ; %bb.0: ; %main_body
; GFX12-PACKED-NEXT: s_clause 0x1
-; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX12-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-NEXT: s_nop 0
; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-NEXT: s_endpgm
@@ -127,16 +127,16 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s0, s3, 0xffff
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s1, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s2, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
@@ -165,13 +165,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: s_and_b32 s0, s3, 0xffff
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -179,13 +179,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz:
; GFX12-PACKED-SDAG: ; %bb.0: ; %main_body
; GFX12-PACKED-SDAG-NEXT: s_clause 0x1
-; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s5
-; GFX12-PACKED-SDAG-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-SDAG-NEXT: s_and_b32 s0, s3, 0xffff
+; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-PACKED-SDAG-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-SDAG-NEXT: s_nop 0
; GFX12-PACKED-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-SDAG-NEXT: s_endpgm
@@ -193,14 +193,14 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz:
; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body
; GFX12-PACKED-GISEL-NEXT: s_clause 0x1
-; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4
+; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s2, s2, s2
; GFX12-PACKED-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-GISEL-NEXT: s_nop 0
; GFX12-PACKED-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-GISEL-NEXT: s_endpgm
@@ -213,18 +213,18 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s0, s3, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s1, s3, 0xffff
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s3, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s2, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s3
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
@@ -251,12 +251,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -264,12 +264,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw:
; GFX12-PACKED: ; %bb.0: ; %main_body
; GFX12-PACKED-NEXT: s_clause 0x1
-; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-NEXT: s_nop 0
; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index f52461b6..2dc346a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -29,12 +29,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
;
; GFX8GISEL-LABEL: uniform_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
@@ -83,12 +83,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-LABEL: uniform_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_clause 0x1
-; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
@@ -96,12 +96,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-LABEL: uniform_value:
; GFX1164GISEL: ; %bb.0: ; %entry
; GFX1164GISEL-NEXT: s_clause 0x1
-; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
@@ -109,11 +109,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-LABEL: uniform_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
; GFX1132DAGISEL-NEXT: s_clause 0x1
-; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
@@ -121,11 +121,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-LABEL: uniform_value:
; GFX1132GISEL: ; %bb.0: ; %entry
; GFX1132GISEL-NEXT: s_clause 0x1
-; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -138,98 +138,98 @@ entry:
define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: const_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
; GFX8GISEL-LABEL: const_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: const_value:
; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: const_value:
; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX10DAGISEL-LABEL: const_value:
; GFX10DAGISEL: ; %bb.0: ; %entry
-; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10DAGISEL-NEXT: s_endpgm
;
; GFX10GISEL-LABEL: const_value:
; GFX10GISEL: ; %bb.0: ; %entry
-; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: const_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: const_value:
; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
;
; GFX1132DAGISEL-LABEL: const_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
;
; GFX1132GISEL-LABEL: const_value:
; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -280,241 +280,241 @@ entry:
define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-LABEL: divergent_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX8DAGISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
; GFX8GISEL-LABEL: divergent_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8GISEL-NEXT: s_mov_b32 s4, 0
; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX8GISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: divergent_value:
; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX9DAGISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: divergent_value:
; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9GISEL-NEXT: s_mov_b32 s4, 0
; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX9GISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX1064DAGISEL-LABEL: divergent_value:
; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1064DAGISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX1064GISEL-LABEL: divergent_value:
; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1064GISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064GISEL-NEXT: s_endpgm
;
; GFX1032DAGISEL-LABEL: divergent_value:
; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032DAGISEL-NEXT: s_max_u32 s2, s2, s5
-; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1032DAGISEL-NEXT: s_max_u32 s0, s0, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1032DAGISEL-NEXT: s_endpgm
;
; GFX1032GISEL-LABEL: divergent_value:
; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032GISEL-NEXT: s_max_u32 s2, s2, s5
-; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1032GISEL-NEXT: s_max_u32 s0, s0, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032GISEL-NEXT: ; %bb.2:
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1032GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: divergent_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[0:1]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1164DAGISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: divergent_value:
; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[0:1]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1164GISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
;
; GFX1132DAGISEL-LABEL: divergent_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_max_u32 s2, s2, s5
-; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1132DAGISEL-NEXT: s_max_u32 s0, s0, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
;
; GFX1132GISEL-LABEL: divergent_value:
; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s5
-; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1132GISEL-NEXT: s_max_u32 s0, s0, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132GISEL-NEXT: ; %bb.2:
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -556,10 +556,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3
; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
; GFX8DAGISEL-NEXT: s_endpgm
;
@@ -590,11 +590,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX8GISEL-NEXT: .LBB4_5: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
@@ -628,10 +628,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: divergent_cfg:
@@ -661,11 +661,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX9GISEL-NEXT: .LBB4_5: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX1064DAGISEL-LABEL: divergent_cfg:
@@ -698,10 +698,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX1064GISEL-LABEL: divergent_cfg:
@@ -731,11 +731,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064GISEL-NEXT: s_endpgm
;
; GFX1032DAGISEL-LABEL: divergent_cfg:
@@ -768,10 +768,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032DAGISEL-NEXT: s_endpgm
;
; GFX1032GISEL-LABEL: divergent_cfg:
@@ -801,11 +801,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX1032GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: divergent_cfg:
@@ -839,10 +839,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
@@ -875,11 +875,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
@@ -915,10 +915,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
@@ -951,10 +951,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index bfdb2da..bfae6f0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -30,12 +30,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
;
; GFX8GISEL-LABEL: uniform_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
@@ -84,12 +84,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-LABEL: uniform_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_clause 0x1
-; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
@@ -97,12 +97,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-LABEL: uniform_value:
; GFX1164GISEL: ; %bb.0: ; %entry
; GFX1164GISEL-NEXT: s_clause 0x1
-; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
@@ -110,11 +110,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-LABEL: uniform_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
; GFX1132DAGISEL-NEXT: s_clause 0x1
-; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
@@ -122,11 +122,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-LABEL: uniform_value:
; GFX1132GISEL: ; %bb.0: ; %entry
; GFX1132GISEL-NEXT: s_clause 0x1
-; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -139,98 +139,98 @@ entry:
define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: const_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
; GFX8GISEL-LABEL: const_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: const_value:
; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: const_value:
; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX10DAGISEL-LABEL: const_value:
; GFX10DAGISEL: ; %bb.0: ; %entry
-; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10DAGISEL-NEXT: s_endpgm
;
; GFX10GISEL-LABEL: const_value:
; GFX10GISEL: ; %bb.0: ; %entry
-; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: const_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: const_value:
; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
;
; GFX1132DAGISEL-LABEL: const_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
;
; GFX1132GISEL-LABEL: const_value:
; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -281,241 +281,241 @@ entry:
define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: divergent_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX8DAGISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
; GFX8GISEL-LABEL: divergent_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8GISEL-NEXT: s_mov_b32 s4, -1
; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX8GISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: divergent_value:
; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX9DAGISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: divergent_value:
; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9GISEL-NEXT: s_mov_b32 s4, -1
; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX9GISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX1064DAGISEL-LABEL: divergent_value:
; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1064DAGISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX1064GISEL-LABEL: divergent_value:
; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064GISEL-NEXT: s_mov_b32 s4, -1
; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1064GISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064GISEL-NEXT: s_endpgm
;
; GFX1032DAGISEL-LABEL: divergent_value:
; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, -1
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s0, -1
; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032DAGISEL-NEXT: s_min_u32 s2, s2, s5
-; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1032DAGISEL-NEXT: s_min_u32 s0, s0, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1032DAGISEL-NEXT: s_endpgm
;
; GFX1032GISEL-LABEL: divergent_value:
; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032GISEL-NEXT: s_mov_b32 s2, -1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, -1
; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032GISEL-NEXT: s_min_u32 s2, s2, s5
-; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1032GISEL-NEXT: s_min_u32 s0, s0, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032GISEL-NEXT: ; %bb.2:
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1032GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: divergent_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[0:1]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1164DAGISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: divergent_value:
; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164GISEL-NEXT: s_mov_b32 s4, -1
; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[0:1]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1164GISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
;
; GFX1132DAGISEL-LABEL: divergent_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, -1
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, -1
; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_min_u32 s2, s2, s5
-; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1132DAGISEL-NEXT: s_min_u32 s0, s0, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
;
; GFX1132GISEL-LABEL: divergent_value:
; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: s_mov_b32 s2, -1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, -1
; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s5
-; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1132GISEL-NEXT: s_min_u32 s0, s0, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132GISEL-NEXT: ; %bb.2:
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -557,10 +557,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3
; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
; GFX8DAGISEL-NEXT: s_endpgm
;
@@ -591,11 +591,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX8GISEL-NEXT: .LBB4_5: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
@@ -629,10 +629,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: divergent_cfg:
@@ -662,11 +662,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX9GISEL-NEXT: .LBB4_5: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX1064DAGISEL-LABEL: divergent_cfg:
@@ -699,10 +699,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX1064GISEL-LABEL: divergent_cfg:
@@ -732,11 +732,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064GISEL-NEXT: s_endpgm
;
; GFX1032DAGISEL-LABEL: divergent_cfg:
@@ -769,10 +769,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032DAGISEL-NEXT: s_endpgm
;
; GFX1032GISEL-LABEL: divergent_cfg:
@@ -802,11 +802,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX1032GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: divergent_cfg:
@@ -840,10 +840,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
@@ -876,11 +876,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
@@ -916,10 +916,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
@@ -952,10 +952,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 3eb2261..e034076 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -5,34 +5,34 @@
define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_signal:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v2, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal -1
; GCN-NEXT: s_barrier_wait -1
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_signal:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal -1
; GLOBAL-ISEL-NEXT: s_barrier_wait -1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -51,34 +51,34 @@ entry:
define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_barrier_signal:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v2, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal 1
; GCN-NEXT: s_barrier_wait 1
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_signal:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal 1
; GLOBAL-ISEL-NEXT: s_barrier_wait 1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -97,34 +97,34 @@ entry:
define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_barrier_signal:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v2, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal 0
; GCN-NEXT: s_barrier_wait 0
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_barrier_signal:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal 0
; GLOBAL-ISEL-NEXT: s_barrier_wait 0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -143,7 +143,7 @@ entry:
define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_signal_var:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
@@ -151,29 +151,29 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v1, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v1, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal m0
; GCN-NEXT: s_barrier_wait 1
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_var:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal m0
; GLOBAL-ISEL-NEXT: s_barrier_wait 1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -222,43 +222,43 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_signal_isfirst:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst -1
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst -1
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -278,43 +278,43 @@ entry:
define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_barrier_signal_isfirst:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst 1
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -334,43 +334,43 @@ entry:
define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_barrier_signal_isfirst:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst 1
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_barrier_signal_isfirst:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -390,45 +390,45 @@ entry:
define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_signal_isfirst_var:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_mov_b32 m0, 1
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst m0
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst_var:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -732,29 +732,29 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) {
define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_join:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_barrier_join -1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_join:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_barrier_join -1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -772,29 +772,29 @@ entry:
define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_barrier_join:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_barrier_join 1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_join:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_barrier_join 1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -812,29 +812,29 @@ entry:
define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_barrier_join:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_barrier_join 0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_barrier_join:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_barrier_join 0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -924,41 +924,41 @@ define void @test5_s_barrier_join_m0(i32 %arg) {
define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_leave:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_barrier_leave
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_leave:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_barrier_leave
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -978,29 +978,29 @@ entry:
define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_wakeup_barrier:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_wakeup_barrier -1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_wakeup_barrier:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wakeup_barrier -1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1018,29 +1018,29 @@ entry:
define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_wakeup_barrier:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_wakeup_barrier 1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_wakeup_barrier:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wakeup_barrier 1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1058,29 +1058,29 @@ entry:
define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_wakeup_barrier:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_wakeup_barrier 0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_wakeup_barrier:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wakeup_barrier 0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1170,27 +1170,27 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) {
define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_get_barrier_state:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_get_barrier_state s2, -1
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_get_barrier_state s4, -1
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_get_barrier_state:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, -1
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1206,27 +1206,27 @@ entry:
define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_get_barrier_state:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_get_barrier_state s2, 1
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_get_barrier_state s4, 1
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_get_barrier_state:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, 1
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1242,27 +1242,27 @@ entry:
define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_get_barrier_state:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_get_barrier_state s2, 0
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_get_barrier_state s4, 0
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_get_barrier_state:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, 0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1352,34 +1352,34 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) {
define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test_barrier_convert:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v2, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal -1
; GCN-NEXT: s_barrier_wait -1
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test_barrier_convert:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal -1
; GLOBAL-ISEL-NEXT: s_barrier_wait -1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
index eb30484..3883b3a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
@@ -5,22 +5,22 @@
define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_doorbell:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL)
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_get_doorbell:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL)
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -32,22 +32,22 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_ddid:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID)
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DDID)
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_get_ddid:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID)
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DDID)
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -59,12 +59,12 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_tma:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_sendmsg_rtn_b64 s[0:1], sendmsg(MSG_RTN_GET_TMA)
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -76,12 +76,12 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_realtime:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_sendmsg_rtn_b64 s[0:1], sendmsg(MSG_RTN_GET_REALTIME)
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -93,22 +93,22 @@ define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_savewave:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE)
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_SAVE_WAVE)
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_savewave:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE)
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_SAVE_WAVE)
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -120,12 +120,12 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_tba:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_sendmsg_rtn_b64 s[0:1], sendmsg(MSG_RTN_GET_TBA)
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -137,22 +137,22 @@ define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_0_i32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0)
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(0, 0, 0)
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_get_0_i32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0)
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(0, 0, 0)
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -164,12 +164,12 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_99999_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_sendmsg_rtn_b64 s[0:1], 99999
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 8f8994e..2c5efd3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -5,16 +5,16 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
store i32 %tmp, ptr addrspace(1) %out
@@ -24,12 +24,12 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
; GCN-LABEL: set_inactive_imm_poison:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
store i32 %tmp, ptr addrspace(1) %out
@@ -39,19 +39,19 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
store i64 %tmp, ptr addrspace(1) %out
@@ -61,13 +61,13 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
; GCN-LABEL: set_inactive_imm_poison_64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
store i64 %tmp, ptr addrspace(1) %out
@@ -81,30 +81,30 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s3, 56
-; GCN-NEXT: s_mov_b64 s[2:3], -1
+; GCN-NEXT: s_mov_b64 s[0:1], -1
; GCN-NEXT: s_cbranch_scc1 .LBB4_3
; GCN-NEXT: ; %bb.1: ; %Flow
-; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN-NEXT: s_cbranch_vccz .LBB4_4
; GCN-NEXT: .LBB4_2: ; %.exit
; GCN-NEXT: s_endpgm
; GCN-NEXT: .LBB4_3: ; %.one
; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0
; GCN-NEXT: s_cbranch_execnz .LBB4_2
; GCN-NEXT: .LBB4_4: ; %.zero
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
%cmp = icmp eq i32 %val, 56
@@ -127,17 +127,17 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x40400000
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
store float %tmp, ptr addrspace(1) %out
@@ -147,21 +147,21 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
store double %tmp, ptr addrspace(1) %out
@@ -171,17 +171,17 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x10001
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s0, 0x10001
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
store <2 x i16> %tmp, ptr addrspace(1) %out
@@ -191,17 +191,17 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x3c003c00
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s0, 0x3c003c00
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
store <2 x half> %tmp, ptr addrspace(1) %out
@@ -259,17 +259,17 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x3f803f80
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s0, 0x3f803f80
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
store <2 x bfloat> %tmp, ptr addrspace(1) %out
@@ -351,19 +351,19 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
store ptr %tmp, ptr addrspace(1) %out
@@ -373,16 +373,16 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
store ptr addrspace(2) %tmp, ptr addrspace(1) %out
@@ -392,16 +392,16 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
store ptr addrspace(3) %tmp, ptr addrspace(1) %out
@@ -411,16 +411,16 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
store ptr addrspace(5) %tmp, ptr addrspace(1) %out
@@ -430,16 +430,16 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
store ptr addrspace(6) %tmp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index 87c5f5b..7bcafea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -147,12 +147,12 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out,
;
; VI-LABEL: bfe_u32_arg_0_width_reg_offset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -172,12 +172,12 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out,
;
; VI-LABEL: bfe_u32_arg_0_width_imm_offset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -204,19 +204,19 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp
;
; VI-LABEL: bfe_u32_zextload_i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in
%ext = zext i8 %load to i32
@@ -248,21 +248,21 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: bfe_u32_zext_in_reg_i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -294,21 +294,21 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: bfe_u32_zext_in_reg_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -341,22 +341,22 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xfe, v0
; VI-NEXT: v_bfe_u32 v0, v0, 1, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -389,22 +389,22 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xf8, v0
; VI-NEXT: v_bfe_u32 v0, v0, 3, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -437,22 +437,22 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0x80, v0
; VI-NEXT: v_bfe_u32 v0, v0, 7, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -484,21 +484,21 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou
;
; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_bfe_u32 v0, v0, 8, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -529,20 +529,20 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1)
@@ -563,12 +563,12 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -590,12 +590,12 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_3:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -617,12 +617,12 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -653,20 +653,20 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_5:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfe_i32 v0, v0, 0, 1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -698,21 +698,21 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_6:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 30, v0
; VI-NEXT: v_and_b32_e32 v0, 2.0, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -742,20 +742,20 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_7:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -785,20 +785,20 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -828,20 +828,20 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_9:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1)
@@ -870,20 +870,20 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_10:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31)
@@ -912,20 +912,20 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_11:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24)
@@ -954,20 +954,20 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_12:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8)
@@ -997,20 +997,20 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_13:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = ashr i32 %x, 31
@@ -1031,12 +1031,12 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_14:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = lshr i32 %x, 31
@@ -1057,12 +1057,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1082,12 +1082,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1107,12 +1107,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1132,12 +1132,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_3:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1157,12 +1157,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1182,12 +1182,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_5:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1207,12 +1207,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_6:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x80
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1232,12 +1232,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_7:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x7f
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1257,12 +1257,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1282,12 +1282,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_9:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1307,12 +1307,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_10:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1332,12 +1332,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_11:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1357,12 +1357,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_12:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1382,12 +1382,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_13:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1407,12 +1407,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_14:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 40
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1432,12 +1432,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_15:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1457,12 +1457,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x7f
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1482,12 +1482,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_17:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x7f
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1507,12 +1507,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_18:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1593,14 +1593,14 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 {
;
; VI-LABEL: lshr_and:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_u32 s4, s4, 0x30006
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bfe_u32 s0, s2, 0x30006
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%b = lshr i32 %a, 6
%c = and i32 %b, 7
@@ -1657,14 +1657,14 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 {
;
; VI-LABEL: and_lshr:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_u32 s4, s4, 0x30006
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bfe_u32 s0, s2, 0x30006
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%b = and i32 %a, 448
%c = lshr i32 %b, 6
@@ -1687,14 +1687,14 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 {
;
; VI-LABEL: and_lshr2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_u32 s4, s4, 0x30006
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bfe_u32 s0, s2, 0x30006
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%b = and i32 %a, 511
%c = lshr i32 %b, 6
@@ -1717,14 +1717,14 @@ define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 {
;
; VI-LABEL: shl_lshr:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_u32 s4, s4, 0x150002
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bfe_u32 s0, s2, 0x150002
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%b = shl i32 %a, 9
%c = lshr i32 %b, 11
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index eeddb3d..7edac87 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -30,58 +30,58 @@ define amdgpu_kernel void @ceil_f16(
;
; VI-LABEL: ceil_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ceil_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: ceil_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: ceil_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_ceil_f16_e32 v0, v0
-; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_nop 0
; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT: s_endpgm
@@ -130,37 +130,37 @@ define amdgpu_kernel void @ceil_v2f16(
;
; VI-LABEL: ceil_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ceil_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_ceil_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: ceil_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l
@@ -172,31 +172,31 @@ define amdgpu_kernel void @ceil_v2f16(
; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: ceil_v2f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_ceil_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_ceil_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_nop 0
; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index fcc4cb3..28d3e8f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -30,55 +30,55 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
;
; GFX8-LABEL: cos_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_cos_f16_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: cos_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX9-NEXT: v_cos_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: cos_f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX10-NEXT: v_cos_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: cos_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cos_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -121,10 +121,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
;
; GFX8-LABEL: cos_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -134,50 +134,50 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_cos_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cos_f16_e32 v3, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: cos_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cos_f16_e32 v2, v3
; GFX9-NEXT: v_cos_f16_e32 v1, v1
; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: cos_v2f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_cos_f16_e32 v2, v3
; GFX10-NEXT: v_cos_f16_e32 v1, v1
; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: cos_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
@@ -188,7 +188,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX11-NEXT: v_cos_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 4f65acd..d60e07d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -12,84 +12,86 @@
define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
; VI-SDAG-LABEL: s_exp_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x39a3b295
; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, s2, v3
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0
+; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1
+; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp_f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2
+; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2
; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0
+; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
@@ -97,39 +99,39 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0
-; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1
+; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1
+; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp_f32:
; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2
; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0
; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3
; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1
+; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp_f32:
@@ -853,7 +855,6 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
@@ -870,16 +871,17 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x39a3b295, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3fb8a000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v2
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
@@ -891,17 +893,17 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
-; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v7, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s4, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3fb8a000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v0
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v6
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
@@ -915,9 +917,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -926,7 +929,6 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -936,14 +938,15 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2
-; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3fb8a000, v5
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v6, s0, v1
; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, s0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v5, v7, v5
; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6
; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7
@@ -952,19 +955,19 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5
; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0
; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v3
-; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000
+; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000
; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_ldexp_f32 v5, v5, v6
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0
; VI-GISEL-NEXT: v_add_f32_e32 v0, v3, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x39a3b295, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v6
; VI-GISEL-NEXT: v_rndne_f32_e32 v6, v1
; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6
@@ -987,19 +990,20 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3
; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2
; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp_v3f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6
@@ -1043,15 +1047,15 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp_v3f32:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1
; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5
@@ -1096,7 +1100,7 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp_v3f32:
@@ -1593,7 +1597,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
@@ -1610,37 +1613,38 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x39a3b295, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3fb8a000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v3
; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
-; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s5, 0xfffff000
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, s0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v6
; VI-SDAG-NEXT: v_sub_f32_e32 v9, s5, v9
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x39a3b295, v9
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3fb8a000, v9
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2
; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10
-; VI-SDAG-NEXT: v_mul_f32_e32 v10, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v10, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7
; VI-SDAG-NEXT: v_add_f32_e32 v9, v10, v9
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v9
@@ -1649,17 +1653,17 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6
-; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v9, v7
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v9, s4, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x39a3b295, v9
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3fb8a000, v9
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v0
; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v9
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
@@ -1673,9 +1677,10 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1685,7 +1690,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8a000
; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x39a3b295
; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -1701,49 +1705,50 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v6, s0, v2
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, s0, v3
; VI-GISEL-NEXT: v_add_f32_e32 v1, v7, v1
; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6
; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7
; VI-GISEL-NEXT: v_add_f32_e32 v1, v6, v1
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7
; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4
+; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v8, s0, v2
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0
; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v9
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, s2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v9, s0, v3
; VI-GISEL-NEXT: v_add_f32_e32 v6, v9, v6
; VI-GISEL-NEXT: v_rndne_f32_e32 v9, v8
; VI-GISEL-NEXT: v_sub_f32_e32 v8, v8, v9
; VI-GISEL-NEXT: v_add_f32_e32 v6, v8, v6
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v8, v9
; VI-GISEL-NEXT: v_exp_f32_e32 v6, v6
-; VI-GISEL-NEXT: s_and_b32 s2, s7, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v3
+; VI-GISEL-NEXT: s_and_b32 s0, s7, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v3
; VI-GISEL-NEXT: v_ldexp_f32 v6, v6, v8
-; VI-GISEL-NEXT: v_mov_b32_e32 v8, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v8, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v8, s7, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3fb8a000, v8
; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8
; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000
@@ -1764,19 +1769,20 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4
; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3
; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2
; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp_v4f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2
@@ -1787,8 +1793,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2
; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218
; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3
; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3
@@ -1833,17 +1839,16 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp_v4f32:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f
; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2
; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0
@@ -1900,7 +1905,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index ff20f90..bd167dd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -14,84 +14,86 @@
define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) {
; VI-SDAG-LABEL: s_exp10_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x3a2784bc
; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, s2, v3
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp10_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549000, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0
+; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1
+; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp10_f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2
+; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2
; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0
+; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
@@ -99,39 +101,39 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) {
; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4
-; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1
+; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1
+; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp10_f32:
; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2
; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0
; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3
; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1
+; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp10_f32:
@@ -855,7 +857,6 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
@@ -872,16 +873,17 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3a2784bc, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x40549000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v2
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
@@ -893,17 +895,17 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
-; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v7, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s4, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x40549000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v0
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v6
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
@@ -917,9 +919,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -928,7 +931,6 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549000
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -938,14 +940,15 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2
-; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x40549000, v5
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v6, s0, v1
; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, s0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v5, v7, v5
; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6
; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7
@@ -954,19 +957,19 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5
; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0
; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v3
-; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000
+; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000
; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_ldexp_f32 v5, v5, v6
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0
; VI-GISEL-NEXT: v_add_f32_e32 v0, v3, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3a2784bc, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x40549000, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v6
; VI-GISEL-NEXT: v_rndne_f32_e32 v6, v1
; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6
@@ -989,19 +992,20 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3
; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2
; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp10_v3f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6
@@ -1045,15 +1049,15 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp10_v3f32:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1
; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5
@@ -1098,7 +1102,7 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp10_v3f32:
@@ -1595,7 +1599,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000
; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
@@ -1612,37 +1615,38 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3a2784bc, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x40549000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v3
; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4
-; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s5, 0xfffff000
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, s0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v6
; VI-SDAG-NEXT: v_sub_f32_e32 v9, s5, v9
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x3a2784bc, v9
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x40549000, v9
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2
; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10
-; VI-SDAG-NEXT: v_mul_f32_e32 v10, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v10, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7
; VI-SDAG-NEXT: v_add_f32_e32 v9, v10, v9
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v9
@@ -1651,17 +1655,17 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6
-; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v9, v7
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v9, s4, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x3a2784bc, v9
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x40549000, v9
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v0
; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v9
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
@@ -1675,9 +1679,10 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1687,7 +1692,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549000
; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3a2784bc
; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -1703,49 +1707,50 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v6, s0, v2
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, s0, v3
; VI-GISEL-NEXT: v_add_f32_e32 v1, v7, v1
; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6
; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7
; VI-GISEL-NEXT: v_add_f32_e32 v1, v6, v1
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7
; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4
+; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v8, s0, v2
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4
; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x40549000, v6
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v9
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, s2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v9, s0, v3
; VI-GISEL-NEXT: v_add_f32_e32 v6, v9, v6
; VI-GISEL-NEXT: v_rndne_f32_e32 v9, v8
; VI-GISEL-NEXT: v_sub_f32_e32 v8, v8, v9
; VI-GISEL-NEXT: v_add_f32_e32 v6, v8, v6
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v8, v9
; VI-GISEL-NEXT: v_exp_f32_e32 v6, v6
-; VI-GISEL-NEXT: s_and_b32 s2, s7, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v3
+; VI-GISEL-NEXT: s_and_b32 s0, s7, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v3
; VI-GISEL-NEXT: v_ldexp_f32 v6, v6, v8
-; VI-GISEL-NEXT: v_mov_b32_e32 v8, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v8, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v8, s7, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x40549000, v8
; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8
; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000
@@ -1766,19 +1771,20 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4
; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3
; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2
; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp10_v4f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2
@@ -1789,8 +1795,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2
; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b
; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3
; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3
@@ -1835,17 +1841,16 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp10_v4f32:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37
; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2
; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0
@@ -1902,7 +1907,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp10_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 06fa910..197aa073 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -50,39 +50,39 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
;
; VI-SDAG-LABEL: s_exp2_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v2, v1, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp2_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0
+; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -109,17 +109,18 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s2, v0
; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; R600-LABEL: s_exp2_f32:
@@ -445,7 +446,7 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-LABEL: s_exp2_v3f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
@@ -467,9 +468,9 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_exp_f32_e32 v6, v6
; VI-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2
; VI-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
; VI-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -730,7 +731,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-LABEL: s_exp2_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
@@ -757,10 +758,10 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_exp_f32_e32 v9, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2
; VI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index e8d037c..fca0398 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -30,58 +30,58 @@ define amdgpu_kernel void @floor_f16(
;
; VI-LABEL: floor_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_floor_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: floor_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: floor_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_floor_f16_e32 v0, v0
-; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_nop 0
; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT: s_endpgm
@@ -131,37 +131,37 @@ define amdgpu_kernel void @floor_v2f16(
;
; VI-LABEL: floor_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_floor_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_floor_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: floor_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
@@ -173,31 +173,31 @@ define amdgpu_kernel void @floor_v2f16(
; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: floor_v2f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_floor_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_floor_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_nop 0
; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index a2e3060..038ad95 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -48,177 +48,177 @@ define amdgpu_kernel void @fmuladd_f16(
;
; VI-FLUSH-LABEL: fmuladd_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
-; VI-FLUSH-NEXT: s_mov_b32 s10, -1
-; VI-FLUSH-NEXT: s_mov_b32 s14, s10
-; VI-FLUSH-NEXT: s_mov_b32 s15, s11
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; VI-FLUSH-NEXT: s_mov_b32 s2, -1
+; VI-FLUSH-NEXT: s_mov_b32 s14, s2
+; VI-FLUSH-NEXT: s_mov_b32 s15, s3
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: s_mov_b32 s12, s2
-; VI-FLUSH-NEXT: s_mov_b32 s13, s3
-; VI-FLUSH-NEXT: s_mov_b32 s16, s4
-; VI-FLUSH-NEXT: s_mov_b32 s17, s5
-; VI-FLUSH-NEXT: s_mov_b32 s18, s10
-; VI-FLUSH-NEXT: s_mov_b32 s19, s11
-; VI-FLUSH-NEXT: s_mov_b32 s4, s6
-; VI-FLUSH-NEXT: s_mov_b32 s5, s7
-; VI-FLUSH-NEXT: s_mov_b32 s6, s10
-; VI-FLUSH-NEXT: s_mov_b32 s7, s11
+; VI-FLUSH-NEXT: s_mov_b32 s12, s6
+; VI-FLUSH-NEXT: s_mov_b32 s13, s7
+; VI-FLUSH-NEXT: s_mov_b32 s16, s8
+; VI-FLUSH-NEXT: s_mov_b32 s17, s9
+; VI-FLUSH-NEXT: s_mov_b32 s18, s2
+; VI-FLUSH-NEXT: s_mov_b32 s19, s3
+; VI-FLUSH-NEXT: s_mov_b32 s8, s10
+; VI-FLUSH-NEXT: s_mov_b32 s9, s11
+; VI-FLUSH-NEXT: s_mov_b32 s10, s2
+; VI-FLUSH-NEXT: s_mov_b32 s11, s3
; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0
-; VI-FLUSH-NEXT: buffer_load_ushort v2, off, s[4:7], 0
-; VI-FLUSH-NEXT: s_mov_b32 s8, s0
-; VI-FLUSH-NEXT: s_mov_b32 s9, s1
+; VI-FLUSH-NEXT: buffer_load_ushort v2, off, s[8:11], 0
+; VI-FLUSH-NEXT: s_mov_b32 s0, s4
+; VI-FLUSH-NEXT: s_mov_b32 s1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mac_f16_e32 v2, v0, v1
-; VI-FLUSH-NEXT: buffer_store_short v2, off, s[8:11], 0
+; VI-FLUSH-NEXT: buffer_store_short v2, off, s[0:3], 0
; VI-FLUSH-NEXT: s_endpgm
;
; VI-DENORM-LABEL: fmuladd_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000
-; VI-DENORM-NEXT: s_mov_b32 s10, -1
-; VI-DENORM-NEXT: s_mov_b32 s14, s10
-; VI-DENORM-NEXT: s_mov_b32 s15, s11
+; VI-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000
+; VI-DENORM-NEXT: s_mov_b32 s2, -1
+; VI-DENORM-NEXT: s_mov_b32 s14, s2
+; VI-DENORM-NEXT: s_mov_b32 s15, s3
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: s_mov_b32 s12, s2
-; VI-DENORM-NEXT: s_mov_b32 s13, s3
-; VI-DENORM-NEXT: s_mov_b32 s16, s4
-; VI-DENORM-NEXT: s_mov_b32 s17, s5
-; VI-DENORM-NEXT: s_mov_b32 s18, s10
-; VI-DENORM-NEXT: s_mov_b32 s19, s11
-; VI-DENORM-NEXT: s_mov_b32 s4, s6
-; VI-DENORM-NEXT: s_mov_b32 s5, s7
-; VI-DENORM-NEXT: s_mov_b32 s6, s10
-; VI-DENORM-NEXT: s_mov_b32 s7, s11
+; VI-DENORM-NEXT: s_mov_b32 s12, s6
+; VI-DENORM-NEXT: s_mov_b32 s13, s7
+; VI-DENORM-NEXT: s_mov_b32 s16, s8
+; VI-DENORM-NEXT: s_mov_b32 s17, s9
+; VI-DENORM-NEXT: s_mov_b32 s18, s2
+; VI-DENORM-NEXT: s_mov_b32 s19, s3
+; VI-DENORM-NEXT: s_mov_b32 s8, s10
+; VI-DENORM-NEXT: s_mov_b32 s9, s11
+; VI-DENORM-NEXT: s_mov_b32 s10, s2
+; VI-DENORM-NEXT: s_mov_b32 s11, s3
; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0
-; VI-DENORM-NEXT: buffer_load_ushort v2, off, s[4:7], 0
-; VI-DENORM-NEXT: s_mov_b32 s8, s0
-; VI-DENORM-NEXT: s_mov_b32 s9, s1
+; VI-DENORM-NEXT: buffer_load_ushort v2, off, s[8:11], 0
+; VI-DENORM-NEXT: s_mov_b32 s0, s4
+; VI-DENORM-NEXT: s_mov_b32 s1, s5
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_fma_f16 v0, v0, v1, v2
-; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-DENORM-NEXT: s_endpgm
;
; GFX10-FLUSH-LABEL: fmuladd_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX10-FLUSH-NEXT: s_mov_b32 s18, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s19, s11
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX10-FLUSH-NEXT: s_mov_b32 s18, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s19, s3
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2
-; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3
-; GFX10-FLUSH-NEXT: s_mov_b32 s16, s4
-; GFX10-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6
+; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7
+; GFX10-FLUSH-NEXT: s_mov_b32 s16, s8
+; GFX10-FLUSH-NEXT: s_mov_b32 s17, s9
; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0
-; GFX10-FLUSH-NEXT: s_mov_b32 s4, s6
-; GFX10-FLUSH-NEXT: s_mov_b32 s5, s7
-; GFX10-FLUSH-NEXT: s_mov_b32 s6, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s7, s11
-; GFX10-FLUSH-NEXT: s_mov_b32 s8, s0
-; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[4:7], 0
-; GFX10-FLUSH-NEXT: s_mov_b32 s9, s1
+; GFX10-FLUSH-NEXT: s_mov_b32 s8, s10
+; GFX10-FLUSH-NEXT: s_mov_b32 s9, s11
+; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3
+; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4
+; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[8:11], 0
+; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2
-; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX10-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX10-DENORM-NEXT: s_mov_b32 s18, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s19, s11
-; GFX10-DENORM-NEXT: s_mov_b32 s22, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s23, s11
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX10-DENORM-NEXT: s_mov_b32 s18, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s19, s3
+; GFX10-DENORM-NEXT: s_mov_b32 s22, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s23, s3
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: s_mov_b32 s12, s2
-; GFX10-DENORM-NEXT: s_mov_b32 s13, s3
-; GFX10-DENORM-NEXT: s_mov_b32 s16, s4
-; GFX10-DENORM-NEXT: s_mov_b32 s17, s5
-; GFX10-DENORM-NEXT: s_mov_b32 s20, s6
-; GFX10-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX10-DENORM-NEXT: s_mov_b32 s12, s6
+; GFX10-DENORM-NEXT: s_mov_b32 s13, s7
+; GFX10-DENORM-NEXT: s_mov_b32 s16, s8
+; GFX10-DENORM-NEXT: s_mov_b32 s17, s9
+; GFX10-DENORM-NEXT: s_mov_b32 s20, s10
+; GFX10-DENORM-NEXT: s_mov_b32 s21, s11
; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; GFX10-DENORM-NEXT: buffer_load_ushort v2, off, s[20:23], 0
-; GFX10-DENORM-NEXT: s_mov_b32 s8, s0
-; GFX10-DENORM-NEXT: s_mov_b32 s9, s1
+; GFX10-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX10-DENORM-NEXT: s_mov_b32 s1, s5
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1
-; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[8:11], 0
+; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[0:3], 0
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s18, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s19, s3
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2
-; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
-; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4
-; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
+; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
+; GFX11-FLUSH-NEXT: s_mov_b32 s16, s8
+; GFX11-FLUSH-NEXT: s_mov_b32 s17, s9
; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[16:19], 0
-; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6
-; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7
-; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0
-; GFX11-FLUSH-NEXT: buffer_load_u16 v2, off, s[4:7], 0
-; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1
+; GFX11-FLUSH-NEXT: s_mov_b32 s8, s10
+; GFX11-FLUSH-NEXT: s_mov_b32 s9, s11
+; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4
+; GFX11-FLUSH-NEXT: buffer_load_u16 v2, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2
-; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s18, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s19, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s22, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s23, s11
+; GFX11-DENORM-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX11-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s18, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s19, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s22, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s23, s3
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: s_mov_b32 s12, s2
-; GFX11-DENORM-NEXT: s_mov_b32 s13, s3
-; GFX11-DENORM-NEXT: s_mov_b32 s16, s4
-; GFX11-DENORM-NEXT: s_mov_b32 s17, s5
-; GFX11-DENORM-NEXT: s_mov_b32 s20, s6
-; GFX11-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
+; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
+; GFX11-DENORM-NEXT: s_mov_b32 s16, s8
+; GFX11-DENORM-NEXT: s_mov_b32 s17, s9
+; GFX11-DENORM-NEXT: s_mov_b32 s20, s10
+; GFX11-DENORM-NEXT: s_mov_b32 s21, s11
; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[16:19], 0
; GFX11-DENORM-NEXT: buffer_load_u16 v2, off, s[20:23], 0
-; GFX11-DENORM-NEXT: s_mov_b32 s8, s0
-; GFX11-DENORM-NEXT: s_mov_b32 s9, s1
+; GFX11-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX11-DENORM-NEXT: s_mov_b32 s1, s5
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1
-; GFX11-DENORM-NEXT: buffer_store_b16 v2, off, s[8:11], 0
+; GFX11-DENORM-NEXT: buffer_store_b16 v2, off, s[0:3], 0
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -361,26 +361,26 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
; GFX11-FLUSH: ; %bb.0:
; GFX11-FLUSH-NEXT: s_clause 0x1
; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s2, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s3, s11
+; GFX11-FLUSH-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
-; GFX11-FLUSH-NEXT: s_mov_b32 s8, s4
+; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4
; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: s_mov_b32 s9, s5
+; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
@@ -389,24 +389,24 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
; GFX11-DENORM: ; %bb.0:
; GFX11-DENORM-NEXT: s_clause 0x1
; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s2, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s3, s11
+; GFX11-DENORM-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX11-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s10, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s11, s3
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: s_mov_b32 s8, s4
-; GFX11-DENORM-NEXT: s_mov_b32 s9, s5
+; GFX11-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX11-DENORM-NEXT: s_mov_b32 s1, s5
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
-; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0
+; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[0:3], 0
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -547,26 +547,26 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; GFX11-FLUSH: ; %bb.0:
; GFX11-FLUSH-NEXT: s_clause 0x1
; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s2, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s3, s11
+; GFX11-FLUSH-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
-; GFX11-FLUSH-NEXT: s_mov_b32 s8, s4
+; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4
; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: s_mov_b32 s9, s5
+; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
@@ -575,24 +575,24 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; GFX11-DENORM: ; %bb.0:
; GFX11-DENORM-NEXT: s_clause 0x1
; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s2, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s3, s11
+; GFX11-DENORM-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX11-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s10, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s11, s3
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: s_mov_b32 s8, s4
-; GFX11-DENORM-NEXT: s_mov_b32 s9, s5
+; GFX11-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX11-DENORM-NEXT: s_mov_b32 s1, s5
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
-; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0
+; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[0:3], 0
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -653,27 +653,27 @@ define amdgpu_kernel void @fmuladd_v2f16(
;
; VI-FLUSH-LABEL: fmuladd_v2f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
-; VI-FLUSH-NEXT: s_mov_b32 s10, -1
-; VI-FLUSH-NEXT: s_mov_b32 s14, s10
-; VI-FLUSH-NEXT: s_mov_b32 s15, s11
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; VI-FLUSH-NEXT: s_mov_b32 s2, -1
+; VI-FLUSH-NEXT: s_mov_b32 s14, s2
+; VI-FLUSH-NEXT: s_mov_b32 s15, s3
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: s_mov_b32 s12, s2
-; VI-FLUSH-NEXT: s_mov_b32 s13, s3
-; VI-FLUSH-NEXT: s_mov_b32 s16, s4
-; VI-FLUSH-NEXT: s_mov_b32 s17, s5
-; VI-FLUSH-NEXT: s_mov_b32 s18, s10
-; VI-FLUSH-NEXT: s_mov_b32 s19, s11
-; VI-FLUSH-NEXT: s_mov_b32 s4, s6
-; VI-FLUSH-NEXT: s_mov_b32 s5, s7
-; VI-FLUSH-NEXT: s_mov_b32 s6, s10
-; VI-FLUSH-NEXT: s_mov_b32 s7, s11
+; VI-FLUSH-NEXT: s_mov_b32 s12, s6
+; VI-FLUSH-NEXT: s_mov_b32 s13, s7
+; VI-FLUSH-NEXT: s_mov_b32 s16, s8
+; VI-FLUSH-NEXT: s_mov_b32 s17, s9
+; VI-FLUSH-NEXT: s_mov_b32 s18, s2
+; VI-FLUSH-NEXT: s_mov_b32 s19, s3
+; VI-FLUSH-NEXT: s_mov_b32 s8, s10
+; VI-FLUSH-NEXT: s_mov_b32 s9, s11
+; VI-FLUSH-NEXT: s_mov_b32 s10, s2
+; VI-FLUSH-NEXT: s_mov_b32 s11, s3
; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[8:11], 0
; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[16:19], 0
-; VI-FLUSH-NEXT: s_mov_b32 s8, s0
-; VI-FLUSH-NEXT: s_mov_b32 s9, s1
+; VI-FLUSH-NEXT: s_mov_b32 s0, s4
+; VI-FLUSH-NEXT: s_mov_b32 s1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(1)
; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
@@ -681,32 +681,32 @@ define amdgpu_kernel void @fmuladd_v2f16(
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-FLUSH-NEXT: v_mac_f16_e32 v1, v0, v2
; VI-FLUSH-NEXT: v_or_b32_e32 v0, v1, v3
-; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-FLUSH-NEXT: s_endpgm
;
; VI-DENORM-LABEL: fmuladd_v2f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000
-; VI-DENORM-NEXT: s_mov_b32 s10, -1
-; VI-DENORM-NEXT: s_mov_b32 s14, s10
-; VI-DENORM-NEXT: s_mov_b32 s15, s11
+; VI-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000
+; VI-DENORM-NEXT: s_mov_b32 s2, -1
+; VI-DENORM-NEXT: s_mov_b32 s14, s2
+; VI-DENORM-NEXT: s_mov_b32 s15, s3
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: s_mov_b32 s16, s4
-; VI-DENORM-NEXT: s_mov_b32 s17, s5
-; VI-DENORM-NEXT: s_mov_b32 s4, s6
-; VI-DENORM-NEXT: s_mov_b32 s5, s7
-; VI-DENORM-NEXT: s_mov_b32 s6, s10
-; VI-DENORM-NEXT: s_mov_b32 s7, s11
-; VI-DENORM-NEXT: s_mov_b32 s12, s2
-; VI-DENORM-NEXT: s_mov_b32 s13, s3
-; VI-DENORM-NEXT: s_mov_b32 s18, s10
-; VI-DENORM-NEXT: s_mov_b32 s19, s11
-; VI-DENORM-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; VI-DENORM-NEXT: s_mov_b32 s16, s8
+; VI-DENORM-NEXT: s_mov_b32 s17, s9
+; VI-DENORM-NEXT: s_mov_b32 s8, s10
+; VI-DENORM-NEXT: s_mov_b32 s9, s11
+; VI-DENORM-NEXT: s_mov_b32 s10, s2
+; VI-DENORM-NEXT: s_mov_b32 s11, s3
+; VI-DENORM-NEXT: s_mov_b32 s12, s6
+; VI-DENORM-NEXT: s_mov_b32 s13, s7
+; VI-DENORM-NEXT: s_mov_b32 s18, s2
+; VI-DENORM-NEXT: s_mov_b32 s19, s3
+; VI-DENORM-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-DENORM-NEXT: buffer_load_dword v2, off, s[12:15], 0
-; VI-DENORM-NEXT: s_mov_b32 s8, s0
-; VI-DENORM-NEXT: s_mov_b32 s9, s1
+; VI-DENORM-NEXT: s_mov_b32 s0, s4
+; VI-DENORM-NEXT: s_mov_b32 s1, s5
; VI-DENORM-NEXT: s_waitcnt vmcnt(2)
; VI-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-DENORM-NEXT: s_waitcnt vmcnt(1)
@@ -717,126 +717,126 @@ define amdgpu_kernel void @fmuladd_v2f16(
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-DENORM-NEXT: v_fma_f16 v0, v2, v1, v0
; VI-DENORM-NEXT: v_or_b32_e32 v0, v0, v3
-; VI-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-DENORM-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-DENORM-NEXT: s_endpgm
;
; GFX10-FLUSH-LABEL: fmuladd_v2f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX10-FLUSH-NEXT: s_mov_b32 s18, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s19, s11
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX10-FLUSH-NEXT: s_mov_b32 s18, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s19, s3
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2
-; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3
-; GFX10-FLUSH-NEXT: s_mov_b32 s16, s4
-; GFX10-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6
+; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7
+; GFX10-FLUSH-NEXT: s_mov_b32 s16, s8
+; GFX10-FLUSH-NEXT: s_mov_b32 s17, s9
; GFX10-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0
; GFX10-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0
-; GFX10-FLUSH-NEXT: s_mov_b32 s4, s6
-; GFX10-FLUSH-NEXT: s_mov_b32 s5, s7
-; GFX10-FLUSH-NEXT: s_mov_b32 s6, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s7, s11
-; GFX10-FLUSH-NEXT: s_mov_b32 s8, s0
-; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0
-; GFX10-FLUSH-NEXT: s_mov_b32 s9, s1
+; GFX10-FLUSH-NEXT: s_mov_b32 s8, s10
+; GFX10-FLUSH-NEXT: s_mov_b32 s9, s11
+; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3
+; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4
+; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2
-; GFX10-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX10-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_v2f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX10-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX10-DENORM-NEXT: s_mov_b32 s18, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s19, s11
-; GFX10-DENORM-NEXT: s_mov_b32 s22, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s23, s11
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX10-DENORM-NEXT: s_mov_b32 s18, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s19, s3
+; GFX10-DENORM-NEXT: s_mov_b32 s22, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s23, s3
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: s_mov_b32 s12, s2
-; GFX10-DENORM-NEXT: s_mov_b32 s13, s3
-; GFX10-DENORM-NEXT: s_mov_b32 s16, s4
-; GFX10-DENORM-NEXT: s_mov_b32 s17, s5
-; GFX10-DENORM-NEXT: s_mov_b32 s20, s6
-; GFX10-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX10-DENORM-NEXT: s_mov_b32 s12, s6
+; GFX10-DENORM-NEXT: s_mov_b32 s13, s7
+; GFX10-DENORM-NEXT: s_mov_b32 s16, s8
+; GFX10-DENORM-NEXT: s_mov_b32 s17, s9
+; GFX10-DENORM-NEXT: s_mov_b32 s20, s10
+; GFX10-DENORM-NEXT: s_mov_b32 s21, s11
; GFX10-DENORM-NEXT: buffer_load_dword v0, off, s[12:15], 0
; GFX10-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0
; GFX10-DENORM-NEXT: buffer_load_dword v2, off, s[20:23], 0
-; GFX10-DENORM-NEXT: s_mov_b32 s8, s0
-; GFX10-DENORM-NEXT: s_mov_b32 s9, s1
+; GFX10-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX10-DENORM-NEXT: s_mov_b32 s1, s5
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2
-; GFX10-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX10-DENORM-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_v2f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s18, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s19, s3
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2
-; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
-; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4
-; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
+; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
+; GFX11-FLUSH-NEXT: s_mov_b32 s16, s8
+; GFX11-FLUSH-NEXT: s_mov_b32 s17, s9
; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-FLUSH-NEXT: buffer_load_b32 v1, off, s[16:19], 0
-; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6
-; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7
-; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0
-; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[4:7], 0
-; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1
+; GFX11-FLUSH-NEXT: s_mov_b32 s8, s10
+; GFX11-FLUSH-NEXT: s_mov_b32 s9, s11
+; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4
+; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2
-; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_v2f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s18, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s19, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s22, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s23, s11
+; GFX11-DENORM-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX11-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s18, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s19, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s22, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s23, s3
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: s_mov_b32 s12, s2
-; GFX11-DENORM-NEXT: s_mov_b32 s13, s3
-; GFX11-DENORM-NEXT: s_mov_b32 s16, s4
-; GFX11-DENORM-NEXT: s_mov_b32 s17, s5
-; GFX11-DENORM-NEXT: s_mov_b32 s20, s6
-; GFX11-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
+; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
+; GFX11-DENORM-NEXT: s_mov_b32 s16, s8
+; GFX11-DENORM-NEXT: s_mov_b32 s17, s9
+; GFX11-DENORM-NEXT: s_mov_b32 s20, s10
+; GFX11-DENORM-NEXT: s_mov_b32 s21, s11
; GFX11-DENORM-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-DENORM-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-DENORM-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-DENORM-NEXT: s_mov_b32 s8, s0
-; GFX11-DENORM-NEXT: s_mov_b32 s9, s1
+; GFX11-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX11-DENORM-NEXT: s_mov_b32 s1, s5
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2
-; GFX11-DENORM-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-DENORM-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
index aca7d3c..df4d3fc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
@@ -107,46 +107,46 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) {
;
; GFX8-LABEL: kernel_fpmode_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 19)
-; GFX8-NEXT: s_and_b32 s2, 0x7f3ff, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 19)
+; GFX8-NEXT: s_and_b32 s0, 0x7f3ff, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: kernel_fpmode_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24)
-; GFX9-NEXT: s_and_b32 s2, 0x87f3ff, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
+; GFX9-NEXT: s_and_b32 s0, 0x87f3ff, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: kernel_fpmode_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24)
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_and_b32 s2, 0x87f3ff, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: s_and_b32 s0, 0x87f3ff, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: kernel_fpmode_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24)
-; GFX11-NEXT: s_and_b32 s2, 0x87f3ff, s2
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
+; GFX11-NEXT: s_and_b32 s0, 0x87f3ff, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index ea823f3..1f62bcc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -28,16 +28,16 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
;
; GFX8CHECK-LABEL: sgpr_isnan_bf16:
; GFX8CHECK: ; %bb.0:
-; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8CHECK-NEXT: v_mov_b32_e32 v0, 0x7fff
-; GFX8CHECK-NEXT: s_movk_i32 s3, 0x7f80
+; GFX8CHECK-NEXT: s_movk_i32 s0, 0x7f80
; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8CHECK-NEXT: v_and_b32_e32 v0, s2, v0
-; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s3, v0
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
+; GFX8CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s0, v0
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s2
; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s3
; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
; GFX8CHECK-NEXT: s_endpgm
;
@@ -58,27 +58,27 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
; GFX10CHECK-LABEL: sgpr_isnan_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_clause 0x1
-; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0
; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s2
+; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX10CHECK-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10CHECK-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10CHECK-NEXT: s_endpgm
;
; GFX11CHECK-LABEL: sgpr_isnan_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_clause 0x1
-; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11CHECK-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11CHECK-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11CHECK-NEXT: v_mov_b32_e32 v1, 0
; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s2
+; GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11CHECK-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11CHECK-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11CHECK-NEXT: s_nop 0
; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index da64c37..26c426a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -43,13 +43,13 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
;
; GFX8CHECK-LABEL: sgpr_isnan_f16:
; GFX8CHECK: ; %bb.0:
-; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
-; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[0:1], s4, 3
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s2
+; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s3
; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
; GFX8CHECK-NEXT: s_endpgm
;
@@ -67,25 +67,25 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
; GFX10CHECK-LABEL: sgpr_isnan_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_clause 0x1
-; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s0, s4, 3
+; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX10CHECK-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10CHECK-NEXT: s_endpgm
;
; GFX11CHECK-LABEL: sgpr_isnan_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_clause 0x1
-; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11CHECK-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11CHECK-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, s4, 3
+; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11CHECK-NEXT: s_nop 0
; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index 347e549..c7e7e7b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -37,13 +37,13 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) {
;
; GFX8CHECK-LABEL: sgpr_isnan_f32:
; GFX8CHECK: ; %bb.0:
-; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
-; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
+; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 3
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s2
+; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s3
; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
; GFX8CHECK-NEXT: s_endpgm
;
@@ -61,26 +61,26 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) {
; GFX10CHECK-LABEL: sgpr_isnan_f32:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_clause 0x1
-; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s0, s4, 3
+; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX10CHECK-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10CHECK-NEXT: s_endpgm
;
; GFX11CHECK-LABEL: sgpr_isnan_f32:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_clause 0x1
-; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11CHECK-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11CHECK-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3
+; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s0, s4, 3
; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11CHECK-NEXT: s_nop 0
; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11CHECK-NEXT: s_endpgm
@@ -115,57 +115,46 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) {
; GFX7GLISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7GLISEL-NEXT: s_endpgm
;
-; GFX8SELDAG-LABEL: sgpr_isnan_f64:
-; GFX8SELDAG: ; %bb.0:
-; GFX8SELDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX8SELDAG-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 3
-; GFX8SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8SELDAG-NEXT: flat_store_dword v[0:1], v2
-; GFX8SELDAG-NEXT: s_endpgm
-;
-; GFX8GLISEL-LABEL: sgpr_isnan_f64:
-; GFX8GLISEL: ; %bb.0:
-; GFX8GLISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
-; GFX8GLISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GLISEL-NEXT: s_endpgm
+; GFX8CHECK-LABEL: sgpr_isnan_f64:
+; GFX8CHECK: ; %bb.0:
+; GFX8CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8CHECK-NEXT: v_cmp_class_f64_e64 s[0:1], s[6:7], 3
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s4
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s5
+; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
+; GFX8CHECK-NEXT: s_endpgm
;
; GFX9CHECK-LABEL: sgpr_isnan_f64:
; GFX9CHECK: ; %bb.0:
-; GFX9CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3]
-; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[0:1], s[6:7], 3
+; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX9CHECK-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9CHECK-NEXT: s_endpgm
;
; GFX10CHECK-LABEL: sgpr_isnan_f64:
; GFX10CHECK: ; %bb.0:
-; GFX10CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s0, s[6:7], 3
+; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX10CHECK-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10CHECK-NEXT: s_endpgm
;
; GFX11CHECK-LABEL: sgpr_isnan_f64:
; GFX11CHECK: ; %bb.0:
-; GFX11CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11CHECK-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3
+; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s0, s[6:7], 3
; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11CHECK-NEXT: s_nop 0
; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11CHECK-NEXT: s_endpgm
@@ -1469,3 +1458,6 @@ declare <7 x i1> @llvm.is.fpclass.v7f32(<7 x float>, i32)
declare <8 x i1> @llvm.is.fpclass.v8f32(<8 x float>, i32)
declare <16 x i1> @llvm.is.fpclass.v16f32(<16 x float>, i32)
declare <2 x i1> @llvm.is.fpclass.v2f64(<2 x double>, i32)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX8GLISEL: {{.*}}
+; GFX8SELDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index ad70589..6f1d374 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -188,14 +188,13 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
;
; GFX1100-SDAG-LABEL: s_log_f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
@@ -207,23 +206,23 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s4
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log_f32:
; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
@@ -233,11 +232,12 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s4
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -1090,18 +1090,18 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
@@ -1122,7 +1122,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1
; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -1137,7 +1137,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6
-; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1146,18 +1146,18 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
@@ -1178,7 +1178,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1
; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -1193,7 +1193,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -1775,31 +1775,31 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3
@@ -1827,7 +1827,7 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
-; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1836,31 +1836,31 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3
@@ -1889,7 +1889,7 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15
-; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 82c73fa..e8671f5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -188,14 +188,13 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
;
; GFX1100-SDAG-LABEL: s_log10_f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
@@ -207,23 +206,23 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s4
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log10_f32:
; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
@@ -233,11 +232,12 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s4
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -1090,18 +1090,18 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
@@ -1122,7 +1122,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1
; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -1137,7 +1137,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6
-; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1146,18 +1146,18 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
@@ -1178,7 +1178,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1
; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -1193,7 +1193,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -1775,31 +1775,31 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3
@@ -1827,7 +1827,7 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
-; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1836,31 +1836,31 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3
@@ -1889,7 +1889,7 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15
-; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index b76e621..88b5e61 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -52,39 +52,39 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
;
; VI-SDAG-LABEL: s_log2_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_log2_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -111,56 +111,57 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; GFX1100-SDAG-LABEL: s_log2_f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
-; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x2c
; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s2
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
-; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log2_f32:
; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s4
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -537,7 +538,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-LABEL: s_log2_v3f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
@@ -559,9 +560,9 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_log_f32_e32 v6, v6
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -658,9 +659,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; GFX1100-SDAG-LABEL: s_log2_v3f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
@@ -668,23 +667,25 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, s5, v4 :: v_dual_mul_f32 v5, s4, v5
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v5, s4, v5
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4
; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3
-; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1]
+; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v5, v3 :: v_dual_sub_f32 v1, v4, v1
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -693,20 +694,20 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
@@ -717,7 +718,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5
-; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -888,7 +889,7 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-LABEL: s_log2_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
@@ -915,10 +916,10 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_log_f32_e32 v9, v1
; VI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1033,9 +1034,7 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
;
; GFX1100-SDAG-LABEL: s_log2_v4f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6
@@ -1048,23 +1047,24 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s7, v2 :: v_dual_mul_f32 v3, s6, v3
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6
; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9
; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0
; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5
-; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1073,36 +1073,36 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s11
; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7
; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index d056a97..b8065d2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -117,27 +117,27 @@ define amdgpu_kernel void @maxnum_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -175,79 +175,79 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
;
; VI-LABEL: maxnum_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -283,79 +283,79 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
;
; VI-LABEL: maxnum_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_f16_imm_b:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_max_f16_e32 v0, 4.0, v0
-; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -396,26 +396,26 @@ define amdgpu_kernel void @maxnum_v2f16(
;
; VI-LABEL: maxnum_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s8, s[4:5], 0x0
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dword s8, s[8:9], 0x0
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v2f16:
@@ -456,19 +456,19 @@ define amdgpu_kernel void @maxnum_v2f16(
; GFX11-LABEL: maxnum_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX11-NEXT: v_pk_max_f16 v1, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX11-NEXT: v_pk_max_f16 v1, s1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -506,61 +506,61 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a(
;
; VI-LABEL: maxnum_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4400
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s4, s4
-; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s0, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0
; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x44004200
-; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x44004200
+; GFX9-NEXT: v_pk_max_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_v2f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
; GFX10-NEXT: v_pk_max_f16 v0, 0x44004200, v0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, 0x44004200, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -596,61 +596,61 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b(
;
; VI-LABEL: maxnum_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4200
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s4, s4
-; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s0, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x42004400
-; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x42004400
+; GFX9-NEXT: v_pk_max_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_v2f16_imm_b:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
; GFX10-NEXT: v_pk_max_f16 v0, 0x42004400, v0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, 0x42004400, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -697,30 +697,30 @@ define amdgpu_kernel void @maxnum_v3f16(
;
; VI-LABEL: maxnum_v3f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e64 v1, s9, s9
-; VI-NEXT: v_max_f16_e64 v2, s3, s3
+; VI-NEXT: v_max_f16_e64 v2, s7, s7
; VI-NEXT: v_max_f16_e32 v1, v2, v1
-; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v3f16:
@@ -769,24 +769,24 @@ define amdgpu_kernel void @maxnum_v3f16(
; GFX11-LABEL: maxnum_v3f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v1, s5, s5
+; GFX11-NEXT: v_pk_max_f16 v1, s1, s1
; GFX11-NEXT: v_pk_max_f16 v2, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v1, v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v0, v3, v0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:4
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -843,35 +843,35 @@ define amdgpu_kernel void @maxnum_v4f16(
;
; VI-LABEL: maxnum_v4f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s9, s9
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: s_lshr_b32 s0, s9, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: s_lshr_b32 s4, s9, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s7, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v0, v1
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_max_f16_e32 v0, v2, v0
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v4f16:
@@ -918,22 +918,22 @@ define amdgpu_kernel void @maxnum_v4f16(
; GFX11-LABEL: maxnum_v4f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s5, s5
+; GFX11-NEXT: v_pk_max_f16 v0, s1, s1
; GFX11-NEXT: v_pk_max_f16 v1, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v2, s4, s4
+; GFX11-NEXT: v_pk_max_f16 v2, s0, s0
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v0, v3, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -980,79 +980,79 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
;
; VI-LABEL: fmax_v4f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x4400
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
+; VI-NEXT: s_lshr_b32 s4, s7, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1
; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_lshr_b32 s4, s6, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmax_v4f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s8, 0x44004200
; GFX9-NEXT: s_mov_b32 s9, 0x40004800
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX9-NEXT: v_pk_max_f16 v2, s2, s2
+; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
; GFX9-NEXT: v_pk_max_f16 v1, v0, s8
; GFX9-NEXT: v_pk_max_f16 v0, v2, s9
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fmax_v4f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX10-NEXT: v_pk_max_f16 v2, s2, s2
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s1, s1
+; GFX10-NEXT: v_pk_max_f16 v2, s0, s0
; GFX10-NEXT: v_pk_max_f16 v1, 0x44004200, v0
; GFX10-NEXT: v_pk_max_f16 v0, 0x40004800, v2
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fmax_v4f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v2, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s1, s1
+; GFX11-NEXT: v_pk_max_f16 v2, s0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v1, 0x44004200, v0
; GFX11-NEXT: v_pk_max_f16 v0, 0x40004800, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index f934a2d..a78fc3a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -117,26 +117,26 @@ define amdgpu_kernel void @minnum_f16_ieee(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -202,78 +202,78 @@ define amdgpu_kernel void @minnum_f16_imm_a(
;
; VI-LABEL: minnum_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_min_f16_e32 v0, 0x4200, v0
-; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_min_f16_e32 v0, 0x4200, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,78 +309,78 @@ define amdgpu_kernel void @minnum_f16_imm_b(
;
; VI-LABEL: minnum_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_f16_imm_b:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_min_f16_e32 v0, 4.0, v0
-; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -421,26 +421,26 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
;
; VI-LABEL: minnum_v2f16_ieee:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s8, s[4:5], 0x0
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dword s8, s[8:9], 0x0
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v2f16_ieee:
@@ -481,18 +481,18 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
; GFX11-LABEL: minnum_v2f16_ieee:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX11-NEXT: v_pk_max_f16 v1, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX11-NEXT: v_pk_max_f16 v1, s1, s1
; GFX11-NEXT: v_pk_min_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -565,60 +565,60 @@ define amdgpu_kernel void @minnum_v2f16_imm_a(
;
; VI-LABEL: minnum_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4400
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s4, s4
-; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s0, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0
; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x44004200
-; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x44004200
+; GFX9-NEXT: v_pk_min_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_v2f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
; GFX10-NEXT: v_pk_min_f16 v0, 0x44004200, v0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: v_pk_min_f16 v0, 0x44004200, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -654,60 +654,60 @@ define amdgpu_kernel void @minnum_v2f16_imm_b(
;
; VI-LABEL: minnum_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4200
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s4, s4
-; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s0, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x42004400
-; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x42004400
+; GFX9-NEXT: v_pk_min_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_v2f16_imm_b:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
; GFX10-NEXT: v_pk_min_f16 v0, 0x42004400, v0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: v_pk_min_f16 v0, 0x42004400, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -754,30 +754,30 @@ define amdgpu_kernel void @minnum_v3f16(
;
; VI-LABEL: minnum_v3f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e64 v1, s9, s9
-; VI-NEXT: v_max_f16_e64 v2, s3, s3
+; VI-NEXT: v_max_f16_e64 v2, s7, s7
; VI-NEXT: v_min_f16_e32 v1, v2, v1
-; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v3f16:
@@ -826,23 +826,23 @@ define amdgpu_kernel void @minnum_v3f16(
; GFX11-LABEL: minnum_v3f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v1, s5, s5
+; GFX11-NEXT: v_pk_max_f16 v1, s1, s1
; GFX11-NEXT: v_pk_max_f16 v2, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_min_f16 v1, v2, v1
; GFX11-NEXT: v_pk_min_f16 v0, v3, v0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:4
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -899,35 +899,35 @@ define amdgpu_kernel void @minnum_v4f16(
;
; VI-LABEL: minnum_v4f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s9, s9
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: s_lshr_b32 s0, s9, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: s_lshr_b32 s4, s9, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s7, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v0, v1
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_min_f16_e32 v0, v2, v0
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v4f16:
@@ -974,21 +974,21 @@ define amdgpu_kernel void @minnum_v4f16(
; GFX11-LABEL: minnum_v4f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s5, s5
+; GFX11-NEXT: v_pk_max_f16 v0, s1, s1
; GFX11-NEXT: v_pk_max_f16 v1, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v2, s4, s4
+; GFX11-NEXT: v_pk_max_f16 v2, s0, s0
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_min_f16 v1, v1, v0
; GFX11-NEXT: v_pk_min_f16 v0, v3, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1035,78 +1035,78 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
;
; VI-LABEL: fmin_v4f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x4400
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
+; VI-NEXT: s_lshr_b32 s4, s7, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1
; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_lshr_b32 s4, s6, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmin_v4f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s8, 0x44004200
; GFX9-NEXT: s_mov_b32 s9, 0x40004800
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX9-NEXT: v_pk_max_f16 v2, s2, s2
+; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
; GFX9-NEXT: v_pk_min_f16 v1, v0, s8
; GFX9-NEXT: v_pk_min_f16 v0, v2, s9
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fmin_v4f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX10-NEXT: v_pk_max_f16 v2, s2, s2
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s1, s1
+; GFX10-NEXT: v_pk_max_f16 v2, s0, s0
; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, v0
; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fmin_v4f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v2, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s1, s1
+; GFX11-NEXT: v_pk_max_f16 v2, s0, s0
; GFX11-NEXT: v_pk_min_f16 v1, 0x44004200, v0
; GFX11-NEXT: v_pk_min_f16 v0, 0x40004800, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index c3e665f..1423575 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -365,57 +365,57 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
;
; GFX9-LABEL: umulo_i64_s:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s0, s3
-; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX9-NEXT: s_add_u32 s9, s8, s7
-; GFX9-NEXT: s_mul_i32 s6, s1, s2
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT: s_add_u32 s9, s9, s6
-; GFX9-NEXT: s_mul_hi_u32 s10, s1, s3
-; GFX9-NEXT: s_addc_u32 s4, s5, s4
-; GFX9-NEXT: s_addc_u32 s5, s10, 0
-; GFX9-NEXT: s_mul_i32 s1, s1, s3
-; GFX9-NEXT: s_add_u32 s4, s4, s1
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_add_i32 s1, s8, s7
-; GFX9-NEXT: s_add_i32 s1, s1, s6
-; GFX9-NEXT: s_mul_i32 s0, s0, s2
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_cselect_b32 s1, 0, s1
-; GFX9-NEXT: s_cselect_b32 s0, 0, s0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_mul_i32 s3, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX9-NEXT: s_add_u32 s9, s8, s3
+; GFX9-NEXT: s_mul_i32 s2, s5, s6
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX9-NEXT: s_add_u32 s9, s9, s2
+; GFX9-NEXT: s_mul_hi_u32 s10, s5, s7
+; GFX9-NEXT: s_addc_u32 s0, s1, s0
+; GFX9-NEXT: s_addc_u32 s1, s10, 0
+; GFX9-NEXT: s_mul_i32 s5, s5, s7
+; GFX9-NEXT: s_add_u32 s0, s0, s5
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_add_i32 s3, s8, s3
+; GFX9-NEXT: s_add_i32 s3, s3, s2
+; GFX9-NEXT: s_mul_i32 s2, s4, s6
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b32 s0, 0, s3
+; GFX9-NEXT: s_cselect_b32 s1, 0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: umulo_i64_s:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s7, s0, s3
-; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT: s_mul_i32 s6, s1, s2
-; GFX10-NEXT: s_mul_hi_u32 s9, s1, s3
-; GFX10-NEXT: s_mul_i32 s1, s1, s3
-; GFX10-NEXT: s_add_u32 s3, s8, s7
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_add_u32 s3, s3, s6
-; GFX10-NEXT: s_addc_u32 s3, s5, s4
-; GFX10-NEXT: s_addc_u32 s5, s9, 0
-; GFX10-NEXT: s_add_u32 s4, s3, s1
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_add_i32 s1, s8, s7
-; GFX10-NEXT: s_mul_i32 s0, s0, s2
-; GFX10-NEXT: s_add_i32 s1, s1, s6
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX10-NEXT: s_cselect_b32 s0, 0, s0
-; GFX10-NEXT: s_cselect_b32 s1, 0, s1
+; GFX10-NEXT: s_mul_i32 s3, s4, s7
+; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX10-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX10-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX10-NEXT: s_mul_i32 s2, s5, s6
+; GFX10-NEXT: s_mul_hi_u32 s9, s5, s7
+; GFX10-NEXT: s_mul_i32 s5, s5, s7
+; GFX10-NEXT: s_add_u32 s7, s8, s3
+; GFX10-NEXT: s_addc_u32 s1, 0, s1
+; GFX10-NEXT: s_add_u32 s7, s7, s2
+; GFX10-NEXT: s_addc_u32 s0, s1, s0
+; GFX10-NEXT: s_addc_u32 s1, s9, 0
+; GFX10-NEXT: s_add_u32 s0, s0, s5
+; GFX10-NEXT: s_addc_u32 s1, 0, s1
+; GFX10-NEXT: s_add_i32 s3, s8, s3
+; GFX10-NEXT: s_mul_i32 s4, s4, s6
+; GFX10-NEXT: s_add_i32 s3, s3, s2
+; GFX10-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10-NEXT: s_cselect_b32 s0, 0, s4
+; GFX10-NEXT: s_cselect_b32 s1, 0, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
@@ -423,28 +423,28 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
;
; GFX11-LABEL: umulo_i64_s:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s7, s0, s3
-; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX11-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX11-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX11-NEXT: s_mul_i32 s6, s1, s2
-; GFX11-NEXT: s_mul_hi_u32 s9, s1, s3
-; GFX11-NEXT: s_mul_i32 s1, s1, s3
-; GFX11-NEXT: s_add_u32 s3, s8, s7
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_add_u32 s3, s3, s6
-; GFX11-NEXT: s_addc_u32 s3, s5, s4
-; GFX11-NEXT: s_addc_u32 s5, s9, 0
-; GFX11-NEXT: s_add_u32 s4, s3, s1
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_add_i32 s1, s8, s7
-; GFX11-NEXT: s_mul_i32 s0, s0, s2
-; GFX11-NEXT: s_add_i32 s1, s1, s6
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX11-NEXT: s_cselect_b32 s0, 0, s0
-; GFX11-NEXT: s_cselect_b32 s1, 0, s1
+; GFX11-NEXT: s_mul_i32 s3, s4, s7
+; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX11-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX11-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX11-NEXT: s_mul_i32 s2, s5, s6
+; GFX11-NEXT: s_mul_hi_u32 s9, s5, s7
+; GFX11-NEXT: s_mul_i32 s5, s5, s7
+; GFX11-NEXT: s_add_u32 s7, s8, s3
+; GFX11-NEXT: s_addc_u32 s1, 0, s1
+; GFX11-NEXT: s_add_u32 s7, s7, s2
+; GFX11-NEXT: s_addc_u32 s0, s1, s0
+; GFX11-NEXT: s_addc_u32 s1, s9, 0
+; GFX11-NEXT: s_add_u32 s0, s0, s5
+; GFX11-NEXT: s_addc_u32 s1, 0, s1
+; GFX11-NEXT: s_add_i32 s3, s8, s3
+; GFX11-NEXT: s_mul_i32 s4, s4, s6
+; GFX11-NEXT: s_add_i32 s3, s3, s2
+; GFX11-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11-NEXT: s_cselect_b32 s0, 0, s4
+; GFX11-NEXT: s_cselect_b32 s1, 0, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
@@ -454,26 +454,26 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
;
; GFX12-LABEL: umulo_i64_s:
; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3
-; GFX12-NEXT: s_mul_i32 s6, s0, s3
-; GFX12-NEXT: s_mul_hi_u32 s4, s0, s2
-; GFX12-NEXT: s_mul_i32 s10, s1, s2
-; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
-; GFX12-NEXT: s_mul_hi_u32 s9, s1, s2
-; GFX12-NEXT: s_mul_hi_u32 s11, s1, s3
-; GFX12-NEXT: s_add_co_u32 s4, s6, s10
-; GFX12-NEXT: s_add_co_ci_u32 s4, s7, s9
-; GFX12-NEXT: s_mul_i32 s8, s1, s3
+; GFX12-NEXT: s_mul_hi_u32 s3, s4, s7
+; GFX12-NEXT: s_mul_i32 s2, s4, s7
+; GFX12-NEXT: s_mul_hi_u32 s0, s4, s6
+; GFX12-NEXT: s_mul_i32 s10, s5, s6
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[0:1], s[2:3]
+; GFX12-NEXT: s_mul_hi_u32 s9, s5, s6
+; GFX12-NEXT: s_mul_hi_u32 s11, s5, s7
+; GFX12-NEXT: s_add_co_u32 s0, s2, s10
+; GFX12-NEXT: s_add_co_ci_u32 s0, s3, s9
+; GFX12-NEXT: s_mul_i32 s8, s5, s7
; GFX12-NEXT: s_add_co_ci_u32 s9, s11, 0
-; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_mul_u64 s[2:3], s[4:5], s[6:7]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[8:9]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX12-NEXT: s_cselect_b32 s0, 0, s0
-; GFX12-NEXT: s_cselect_b32 s1, 0, s1
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12-NEXT: s_cselect_b32 s0, 0, s2
+; GFX12-NEXT: s_cselect_b32 s1, 0, s3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_nop 0
@@ -540,81 +540,81 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
;
; GFX9-LABEL: smulo_i64_s:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s0, s3
-; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX9-NEXT: s_add_u32 s9, s8, s7
-; GFX9-NEXT: s_mul_i32 s6, s1, s2
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT: s_add_u32 s9, s9, s6
-; GFX9-NEXT: s_mul_hi_i32 s10, s1, s3
-; GFX9-NEXT: s_addc_u32 s4, s5, s4
-; GFX9-NEXT: s_addc_u32 s5, s10, 0
-; GFX9-NEXT: s_mul_i32 s9, s1, s3
-; GFX9-NEXT: s_add_u32 s4, s4, s9
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_sub_u32 s9, s4, s2
-; GFX9-NEXT: s_subb_u32 s10, s5, 0
-; GFX9-NEXT: s_cmp_lt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
-; GFX9-NEXT: s_cselect_b32 s1, s10, s5
-; GFX9-NEXT: s_sub_u32 s9, s4, s0
-; GFX9-NEXT: s_subb_u32 s5, s1, 0
-; GFX9-NEXT: s_cmp_lt_i32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s5, s5, s1
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
-; GFX9-NEXT: s_add_i32 s1, s8, s7
-; GFX9-NEXT: s_add_i32 s1, s1, s6
-; GFX9-NEXT: s_ashr_i32 s6, s1, 31
-; GFX9-NEXT: s_mov_b32 s7, s6
-; GFX9-NEXT: s_mul_i32 s0, s0, s2
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX9-NEXT: s_cselect_b32 s1, 0, s1
-; GFX9-NEXT: s_cselect_b32 s0, 0, s0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_mul_i32 s3, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX9-NEXT: s_add_u32 s9, s8, s3
+; GFX9-NEXT: s_mul_i32 s2, s5, s6
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX9-NEXT: s_add_u32 s9, s9, s2
+; GFX9-NEXT: s_mul_hi_i32 s10, s5, s7
+; GFX9-NEXT: s_addc_u32 s0, s1, s0
+; GFX9-NEXT: s_addc_u32 s1, s10, 0
+; GFX9-NEXT: s_mul_i32 s9, s5, s7
+; GFX9-NEXT: s_add_u32 s0, s0, s9
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_sub_u32 s9, s0, s6
+; GFX9-NEXT: s_subb_u32 s10, s1, 0
+; GFX9-NEXT: s_cmp_lt_i32 s5, 0
+; GFX9-NEXT: s_cselect_b32 s0, s9, s0
+; GFX9-NEXT: s_cselect_b32 s1, s10, s1
+; GFX9-NEXT: s_sub_u32 s5, s0, s4
+; GFX9-NEXT: s_subb_u32 s9, s1, 0
+; GFX9-NEXT: s_cmp_lt_i32 s7, 0
+; GFX9-NEXT: s_cselect_b32 s1, s9, s1
+; GFX9-NEXT: s_cselect_b32 s0, s5, s0
+; GFX9-NEXT: s_add_i32 s3, s8, s3
+; GFX9-NEXT: s_add_i32 s5, s3, s2
+; GFX9-NEXT: s_ashr_i32 s2, s5, 31
+; GFX9-NEXT: s_mov_b32 s3, s2
+; GFX9-NEXT: s_mul_i32 s4, s4, s6
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX9-NEXT: s_cselect_b32 s0, 0, s5
+; GFX9-NEXT: s_cselect_b32 s1, 0, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: smulo_i64_s:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s7, s0, s3
-; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT: s_mul_i32 s6, s1, s2
-; GFX10-NEXT: s_add_u32 s11, s8, s7
-; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_mul_hi_i32 s9, s1, s3
-; GFX10-NEXT: s_add_u32 s11, s11, s6
-; GFX10-NEXT: s_mul_i32 s10, s1, s3
-; GFX10-NEXT: s_addc_u32 s4, s5, s4
-; GFX10-NEXT: s_addc_u32 s5, s9, 0
-; GFX10-NEXT: s_add_u32 s4, s4, s10
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_sub_u32 s9, s4, s2
-; GFX10-NEXT: s_subb_u32 s10, s5, 0
-; GFX10-NEXT: s_cmp_lt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s1, s9, s4
-; GFX10-NEXT: s_cselect_b32 s4, s10, s5
-; GFX10-NEXT: s_sub_u32 s9, s1, s0
-; GFX10-NEXT: s_subb_u32 s5, s4, 0
-; GFX10-NEXT: s_cmp_lt_i32 s3, 0
-; GFX10-NEXT: s_mul_i32 s0, s0, s2
-; GFX10-NEXT: s_cselect_b32 s5, s5, s4
-; GFX10-NEXT: s_cselect_b32 s4, s9, s1
-; GFX10-NEXT: s_add_i32 s1, s8, s7
-; GFX10-NEXT: s_add_i32 s1, s1, s6
-; GFX10-NEXT: s_ashr_i32 s6, s1, 31
-; GFX10-NEXT: s_mov_b32 s7, s6
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX10-NEXT: s_cselect_b32 s0, 0, s0
-; GFX10-NEXT: s_cselect_b32 s1, 0, s1
+; GFX10-NEXT: s_mul_i32 s3, s4, s7
+; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX10-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX10-NEXT: s_mul_i32 s2, s5, s6
+; GFX10-NEXT: s_add_u32 s11, s8, s3
+; GFX10-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX10-NEXT: s_addc_u32 s1, 0, s1
+; GFX10-NEXT: s_mul_hi_i32 s9, s5, s7
+; GFX10-NEXT: s_add_u32 s11, s11, s2
+; GFX10-NEXT: s_mul_i32 s10, s5, s7
+; GFX10-NEXT: s_addc_u32 s0, s1, s0
+; GFX10-NEXT: s_addc_u32 s1, s9, 0
+; GFX10-NEXT: s_add_u32 s0, s0, s10
+; GFX10-NEXT: s_addc_u32 s1, 0, s1
+; GFX10-NEXT: s_sub_u32 s9, s0, s6
+; GFX10-NEXT: s_subb_u32 s10, s1, 0
+; GFX10-NEXT: s_cmp_lt_i32 s5, 0
+; GFX10-NEXT: s_cselect_b32 s0, s9, s0
+; GFX10-NEXT: s_cselect_b32 s1, s10, s1
+; GFX10-NEXT: s_sub_u32 s5, s0, s4
+; GFX10-NEXT: s_subb_u32 s9, s1, 0
+; GFX10-NEXT: s_cmp_lt_i32 s7, 0
+; GFX10-NEXT: s_mul_i32 s4, s4, s6
+; GFX10-NEXT: s_cselect_b32 s1, s9, s1
+; GFX10-NEXT: s_cselect_b32 s0, s5, s0
+; GFX10-NEXT: s_add_i32 s3, s8, s3
+; GFX10-NEXT: s_add_i32 s5, s3, s2
+; GFX10-NEXT: s_ashr_i32 s2, s5, 31
+; GFX10-NEXT: s_mov_b32 s3, s2
+; GFX10-NEXT: s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX10-NEXT: s_cselect_b32 s0, 0, s4
+; GFX10-NEXT: s_cselect_b32 s1, 0, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
@@ -622,42 +622,42 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
;
; GFX11-LABEL: smulo_i64_s:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s7, s0, s3
-; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX11-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX11-NEXT: s_mul_i32 s6, s1, s2
-; GFX11-NEXT: s_add_u32 s11, s8, s7
-; GFX11-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_mul_hi_i32 s9, s1, s3
-; GFX11-NEXT: s_add_u32 s11, s11, s6
-; GFX11-NEXT: s_mul_i32 s10, s1, s3
-; GFX11-NEXT: s_addc_u32 s4, s5, s4
-; GFX11-NEXT: s_addc_u32 s5, s9, 0
-; GFX11-NEXT: s_add_u32 s4, s4, s10
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_sub_u32 s9, s4, s2
-; GFX11-NEXT: s_subb_u32 s10, s5, 0
-; GFX11-NEXT: s_cmp_lt_i32 s1, 0
-; GFX11-NEXT: s_cselect_b32 s1, s9, s4
-; GFX11-NEXT: s_cselect_b32 s4, s10, s5
-; GFX11-NEXT: s_sub_u32 s9, s1, s0
-; GFX11-NEXT: s_subb_u32 s5, s4, 0
-; GFX11-NEXT: s_cmp_lt_i32 s3, 0
-; GFX11-NEXT: s_mul_i32 s0, s0, s2
-; GFX11-NEXT: s_cselect_b32 s5, s5, s4
-; GFX11-NEXT: s_cselect_b32 s4, s9, s1
-; GFX11-NEXT: s_add_i32 s1, s8, s7
+; GFX11-NEXT: s_mul_i32 s3, s4, s7
+; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX11-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX11-NEXT: s_mul_i32 s2, s5, s6
+; GFX11-NEXT: s_add_u32 s11, s8, s3
+; GFX11-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX11-NEXT: s_addc_u32 s1, 0, s1
+; GFX11-NEXT: s_mul_hi_i32 s9, s5, s7
+; GFX11-NEXT: s_add_u32 s11, s11, s2
+; GFX11-NEXT: s_mul_i32 s10, s5, s7
+; GFX11-NEXT: s_addc_u32 s0, s1, s0
+; GFX11-NEXT: s_addc_u32 s1, s9, 0
+; GFX11-NEXT: s_add_u32 s0, s0, s10
+; GFX11-NEXT: s_addc_u32 s1, 0, s1
+; GFX11-NEXT: s_sub_u32 s9, s0, s6
+; GFX11-NEXT: s_subb_u32 s10, s1, 0
+; GFX11-NEXT: s_cmp_lt_i32 s5, 0
+; GFX11-NEXT: s_cselect_b32 s0, s9, s0
+; GFX11-NEXT: s_cselect_b32 s1, s10, s1
+; GFX11-NEXT: s_sub_u32 s5, s0, s4
+; GFX11-NEXT: s_subb_u32 s9, s1, 0
+; GFX11-NEXT: s_cmp_lt_i32 s7, 0
+; GFX11-NEXT: s_mul_i32 s4, s4, s6
+; GFX11-NEXT: s_cselect_b32 s1, s9, s1
+; GFX11-NEXT: s_cselect_b32 s0, s5, s0
+; GFX11-NEXT: s_add_i32 s3, s8, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s1, s1, s6
-; GFX11-NEXT: s_ashr_i32 s6, s1, 31
+; GFX11-NEXT: s_add_i32 s5, s3, s2
+; GFX11-NEXT: s_ashr_i32 s2, s5, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mov_b32 s7, s6
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX11-NEXT: s_cselect_b32 s0, 0, s0
-; GFX11-NEXT: s_cselect_b32 s1, 0, s1
+; GFX11-NEXT: s_mov_b32 s3, s2
+; GFX11-NEXT: s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX11-NEXT: s_cselect_b32 s0, 0, s4
+; GFX11-NEXT: s_cselect_b32 s1, 0, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
@@ -667,39 +667,39 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
;
; GFX12-LABEL: smulo_i64_s:
; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3
-; GFX12-NEXT: s_mul_i32 s6, s0, s3
-; GFX12-NEXT: s_mul_hi_u32 s4, s0, s2
-; GFX12-NEXT: s_mul_i32 s10, s1, s2
-; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
-; GFX12-NEXT: s_mul_hi_u32 s9, s1, s2
-; GFX12-NEXT: s_mul_hi_i32 s11, s1, s3
-; GFX12-NEXT: s_add_co_u32 s4, s6, s10
-; GFX12-NEXT: s_add_co_ci_u32 s4, s7, s9
-; GFX12-NEXT: s_mul_i32 s8, s1, s3
+; GFX12-NEXT: s_mul_hi_u32 s3, s4, s7
+; GFX12-NEXT: s_mul_i32 s2, s4, s7
+; GFX12-NEXT: s_mul_hi_u32 s0, s4, s6
+; GFX12-NEXT: s_mul_i32 s10, s5, s6
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[0:1], s[2:3]
+; GFX12-NEXT: s_mul_hi_u32 s9, s5, s6
+; GFX12-NEXT: s_mul_hi_i32 s11, s5, s7
+; GFX12-NEXT: s_add_co_u32 s0, s2, s10
+; GFX12-NEXT: s_add_co_ci_u32 s0, s3, s9
+; GFX12-NEXT: s_mul_i32 s8, s5, s7
; GFX12-NEXT: s_add_co_ci_u32 s9, s11, 0
-; GFX12-NEXT: s_cmp_lt_i32 s1, 0
-; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[8:9]
-; GFX12-NEXT: s_mov_b32 s4, s2
+; GFX12-NEXT: s_cmp_lt_i32 s5, 0
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[0:1], s[8:9]
+; GFX12-NEXT: s_mov_b32 s0, s6
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_sub_nc_u64 s[8:9], s[6:7], s[4:5]
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_cselect_b32 s7, s9, s7
-; GFX12-NEXT: s_cselect_b32 s6, s8, s6
-; GFX12-NEXT: s_cmp_lt_i32 s3, 0
-; GFX12-NEXT: s_sub_nc_u64 s[4:5], s[6:7], s[4:5]
-; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: s_cselect_b32 s3, s5, s7
-; GFX12-NEXT: s_cselect_b32 s2, s4, s6
-; GFX12-NEXT: s_ashr_i32 s4, s1, 31
+; GFX12-NEXT: s_sub_nc_u64 s[8:9], s[2:3], s[0:1]
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_cselect_b32 s3, s9, s3
+; GFX12-NEXT: s_cselect_b32 s2, s8, s2
+; GFX12-NEXT: s_cmp_lt_i32 s7, 0
+; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT: s_cselect_b32 s1, s1, s3
+; GFX12-NEXT: s_cselect_b32 s0, s0, s2
+; GFX12-NEXT: s_ashr_i32 s2, s5, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s5, s4
-; GFX12-NEXT: s_cmp_lg_u64 s[2:3], s[4:5]
-; GFX12-NEXT: s_cselect_b32 s0, 0, s0
-; GFX12-NEXT: s_cselect_b32 s1, 0, s1
+; GFX12-NEXT: s_mov_b32 s3, s2
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX12-NEXT: s_cselect_b32 s0, 0, s4
+; GFX12-NEXT: s_cselect_b32 s1, 0, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index 9fcbdf3..27ea3e8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -136,12 +136,12 @@ define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) {
; VI-LABEL: local_size_xy:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_mul_i32 s0, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -179,14 +179,14 @@ define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) {
;
; VI-LABEL: local_size_xz:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s2, s[0:1], 0x18
-; VI-NEXT: s_load_dword s3, s[0:1], 0x20
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x18
+; VI-NEXT: s_load_dword s5, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_mul_i32 s0, s4, s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -269,14 +269,14 @@ define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) {
; VI-LABEL: local_size_xyz:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18
-; VI-NEXT: s_load_dword s4, s[0:1], 0x20
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s6, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s2, s2, s3
-; VI-NEXT: s_add_i32 s2, s2, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_mul_i32 s0, s2, s3
+; VI-NEXT: s_add_i32 s0, s0, s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 84afa3b0..18c910a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -30,38 +30,38 @@ define amdgpu_kernel void @rint_f16(
;
; GFX89-LABEL: rint_f16:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_rndne_f16_e32 v0, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: rint_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -111,64 +111,64 @@ define amdgpu_kernel void @rint_v2f16(
;
; VI-LABEL: rint_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_rndne_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: rint_v2f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rndne_f16_e32 v1, v0
; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: rint_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rndne_f16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index c5d2f79..d1e2008 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -24,60 +24,42 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
-; GFX8-LABEL: round_f32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_trunc_f32_e32 v0, s6
-; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
-; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: round_f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_trunc_f32_e32 v0, s2
-; GFX9-NEXT: v_sub_f32_e32 v1, s2, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT: s_brev_b32 s0, -2
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX9-NEXT: s_endpgm
+; GFX89-LABEL: round_f32:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX89-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: v_trunc_f32_e32 v0, s2
+; GFX89-NEXT: v_sub_f32_e32 v1, s2, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT: s_brev_b32 s0, -2
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_bfi_b32 v1, s0, v1, v2
+; GFX89-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: round_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v1, s2, v0
-; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v1|, 0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v1|, 0.5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -253,90 +235,52 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
-; GFX8-LABEL: round_v4f32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_brev_b32 s10, -2
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_trunc_f32_e32 v0, s7
-; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v2, s7
-; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2
-; GFX8-NEXT: v_add_f32_e32 v3, v0, v1
-; GFX8-NEXT: v_trunc_f32_e32 v0, s6
-; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2
-; GFX8-NEXT: v_add_f32_e32 v2, v0, v1
-; GFX8-NEXT: v_trunc_f32_e32 v0, s5
-; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
-; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4
-; GFX8-NEXT: v_add_f32_e32 v1, v0, v1
-; GFX8-NEXT: v_trunc_f32_e32 v0, s4
-; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v5, s4
-; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: round_v4f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX9-NEXT: s_brev_b32 s2, -2
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_trunc_f32_e32 v0, s7
-; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
-; GFX9-NEXT: v_add_f32_e32 v3, v0, v1
-; GFX9-NEXT: v_trunc_f32_e32 v0, s6
-; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
-; GFX9-NEXT: v_add_f32_e32 v2, v0, v1
-; GFX9-NEXT: v_trunc_f32_e32 v0, s5
-; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4
-; GFX9-NEXT: v_add_f32_e32 v1, v0, v1
-; GFX9-NEXT: v_trunc_f32_e32 v0, s4
-; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT: s_endpgm
+; GFX89-LABEL: round_v4f32:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX89-NEXT: s_brev_b32 s2, -2
+; GFX89-NEXT: s_mov_b32 s11, 0xf000
+; GFX89-NEXT: s_mov_b32 s10, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: v_trunc_f32_e32 v0, s7
+; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT: v_mov_b32_e32 v2, s7
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX89-NEXT: v_add_f32_e32 v3, v0, v1
+; GFX89-NEXT: v_trunc_f32_e32 v0, s6
+; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT: v_mov_b32_e32 v2, s6
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX89-NEXT: v_add_f32_e32 v2, v0, v1
+; GFX89-NEXT: v_trunc_f32_e32 v0, s5
+; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT: v_mov_b32_e32 v4, s5
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v4
+; GFX89-NEXT: v_add_f32_e32 v1, v0, v1
+; GFX89-NEXT: v_trunc_f32_e32 v0, s4
+; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
+; GFX89-NEXT: v_mov_b32_e32 v5, s4
+; GFX89-NEXT: v_bfi_b32 v4, s2, v4, v5
+; GFX89-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: round_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s7
; GFX11-NEXT: v_trunc_f32_e32 v1, s6
@@ -346,27 +290,26 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
; GFX11-NEXT: v_dual_sub_f32 v2, s7, v0 :: v_dual_sub_f32 v3, s6, v1
; GFX11-NEXT: v_dual_sub_f32 v6, s5, v4 :: v_dual_sub_f32 v7, s4, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v3|, 0.5
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v6|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v6|, 0.5
; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, v3, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v7|, 0.5
; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v6, 0x7fffffff, v6, s5
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s4
; GFX11-NEXT: v_dual_add_f32 v1, v4, v6 :: v_dual_add_f32 v0, v5, v7
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -481,77 +424,78 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
; GFX89-LABEL: round_v8f32:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GFX89-NEXT: s_brev_b32 s14, -2
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GFX89-NEXT: s_brev_b32 s2, -2
+; GFX89-NEXT: s_mov_b32 s15, 0xf000
+; GFX89-NEXT: s_mov_b32 s14, -1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_trunc_f32_e32 v0, s7
; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v2, s7
-; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2
; GFX89-NEXT: v_add_f32_e32 v3, v0, v1
; GFX89-NEXT: v_trunc_f32_e32 v0, s6
; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v2, s6
-; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2
; GFX89-NEXT: v_add_f32_e32 v2, v0, v1
; GFX89-NEXT: v_trunc_f32_e32 v0, s5
; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v4, s5
-; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v4
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v4
; GFX89-NEXT: v_add_f32_e32 v1, v0, v1
; GFX89-NEXT: v_trunc_f32_e32 v0, s4
; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v5, s4
-; GFX89-NEXT: v_bfi_b32 v4, s14, v4, v5
+; GFX89-NEXT: v_bfi_b32 v4, s2, v4, v5
; GFX89-NEXT: v_add_f32_e32 v0, v0, v4
; GFX89-NEXT: v_trunc_f32_e32 v4, s11
; GFX89-NEXT: v_sub_f32_e32 v5, s11, v4
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v6, s11
-; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6
+; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6
; GFX89-NEXT: v_add_f32_e32 v7, v4, v5
; GFX89-NEXT: v_trunc_f32_e32 v4, s10
; GFX89-NEXT: v_sub_f32_e32 v5, s10, v4
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v6, s10
-; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6
+; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6
; GFX89-NEXT: v_add_f32_e32 v6, v4, v5
; GFX89-NEXT: v_trunc_f32_e32 v4, s9
; GFX89-NEXT: v_sub_f32_e32 v5, s9, v4
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v8, s9
-; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v8
+; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v8
; GFX89-NEXT: v_add_f32_e32 v5, v4, v5
; GFX89-NEXT: v_trunc_f32_e32 v4, s8
; GFX89-NEXT: v_sub_f32_e32 v8, s8, v4
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v9, s8
-; GFX89-NEXT: v_bfi_b32 v8, s14, v8, v9
+; GFX89-NEXT: v_bfi_b32 v8, s2, v8, v9
; GFX89-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: round_v8f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s15, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s7
; GFX11-NEXT: v_trunc_f32_e32 v1, s6
@@ -564,57 +508,56 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
; GFX11-NEXT: v_trunc_f32_e32 v9, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_f32_e32 v12, s11, v5
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
; GFX11-NEXT: v_sub_f32_e32 v11, s4, v8
; GFX11-NEXT: v_trunc_f32_e32 v6, s10
; GFX11-NEXT: v_sub_f32_e32 v14, s9, v9
; GFX11-NEXT: v_trunc_f32_e32 v10, s8
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v3|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v7|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v16, 0x7fffffff, v3, s6
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v11|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v11|, 0.5
; GFX11-NEXT: v_sub_f32_e32 v13, s10, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v16
; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v12|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v12|, 0.5
; GFX11-NEXT: v_add_f32_e32 v1, v4, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v13|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v13|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v12, 0x7fffffff, v12, s11
-; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v14|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v14|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v7, v5, v12
; GFX11-NEXT: v_bfi_b32 v13, 0x7fffffff, v13, s10
; GFX11-NEXT: v_sub_f32_e32 v15, s8, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, 1.0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, 1.0, s0
; GFX11-NEXT: v_add_f32_e32 v6, v6, v13
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v15|, 0.5
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v15|, 0.5
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v14, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v15, 0, 1.0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v15, 0, 1.0, s0
; GFX11-NEXT: v_dual_add_f32 v5, v9, v0 :: v_dual_add_f32 v0, v8, v11
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v15, s8
; GFX11-NEXT: v_add_f32_e32 v4, v10, v4
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[12:15], 0 offset:16
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[12:15], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -702,62 +645,43 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
-; GFX8-LABEL: round_f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00
-; GFX8-NEXT: s_movk_i32 s5, 0x7fff
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_trunc_f16_e32 v1, s4
-; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1
-; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v2
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_f16_e32 v0, v1, v0
-; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: round_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
-; GFX9-NEXT: s_movk_i32 s0, 0x7fff
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_trunc_f16_e32 v1, s2
-; GFX9-NEXT: v_sub_f16_e32 v2, s2, v1
-; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v2
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: v_add_f16_e32 v0, v1, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
-; GFX9-NEXT: s_endpgm
+; GFX89-LABEL: round_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX89-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c00
+; GFX89-NEXT: s_movk_i32 s0, 0x7fff
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: v_trunc_f16_e32 v1, s2
+; GFX89-NEXT: v_sub_f16_e32 v2, s2, v1
+; GFX89-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_bfi_b32 v0, s0, v0, v2
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: v_add_f16_e32 v0, v1, v0
+; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: round_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f16_e32 v0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0
-; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: v_cmp_ge_f16_e64 s0, |v1|, 0.5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -828,30 +752,30 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
;
; GFX8-LABEL: round_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00
-; GFX8-NEXT: s_movk_i32 s6, 0x7fff
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_movk_i32 s1, 0x7fff
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s5, s4, 16
-; GFX8-NEXT: v_trunc_f16_e32 v1, s5
-; GFX8-NEXT: v_sub_f16_e32 v2, s5, v1
+; GFX8-NEXT: s_lshr_b32 s0, s2, 16
+; GFX8-NEXT: v_trunc_f16_e32 v1, s0
+; GFX8-NEXT: v_sub_f16_e32 v2, s0, v1
; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_bfi_b32 v2, s6, v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: v_bfi_b32 v2, s1, v2, v3
; GFX8-NEXT: v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_trunc_f16_e32 v2, s4
-; GFX8-NEXT: v_sub_f16_e32 v3, s4, v2
+; GFX8-NEXT: v_trunc_f16_e32 v2, s2
+; GFX8-NEXT: v_sub_f16_e32 v3, s2, v2
; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: v_bfi_b32 v0, s6, v0, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v3
; GFX8-NEXT: v_add_f16_e32 v0, v2, v0
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: round_v2f16:
@@ -886,7 +810,9 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-NEXT: v_trunc_f16_e32 v1, s2
@@ -895,22 +821,20 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1
; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX11-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5
+; GFX11-NEXT: v_cmp_ge_f16_e64 s0, |v3|, 0.5
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s4
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s0
; GFX11-NEXT: v_add_f16_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_add_f16_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 2ce0a62..4082ad7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -30,55 +30,55 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
;
; GFX8-LABEL: sin_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_sin_f16_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sin_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX9-NEXT: v_sin_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sin_f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX10-NEXT: v_sin_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sin_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sin_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -121,10 +121,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
;
; GFX8-LABEL: sin_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -134,50 +134,50 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_sin_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_sin_f16_e32 v3, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sin_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_sin_f16_e32 v2, v3
; GFX9-NEXT: v_sin_f16_e32 v1, v1
; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sin_v2f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_sin_f16_e32 v2, v3
; GFX10-NEXT: v_sin_f16_e32 v1, v1
; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sin_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
@@ -188,7 +188,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX11-NEXT: v_sin_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index f2d57ba9..dc19189 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -29,38 +29,38 @@ define amdgpu_kernel void @sqrt_f16(
;
; VI-LABEL: sqrt_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sqrt_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sqrt_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -109,37 +109,37 @@ define amdgpu_kernel void @sqrt_v2f16(
;
; VI-LABEL: sqrt_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sqrt_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_sqrt_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sqrt_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
@@ -147,7 +147,7 @@ define amdgpu_kernel void @sqrt_v2f16(
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index d1e2ddc..3fb1699 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -29,38 +29,38 @@ define amdgpu_kernel void @trunc_f16(
;
; VI-LABEL: trunc_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: trunc_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_trunc_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -110,44 +110,44 @@ define amdgpu_kernel void @trunc_v2f16(
;
; VI-LABEL: trunc_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_trunc_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_trunc_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: trunc_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_trunc_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_trunc_f16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index cfaefca..9de4eae 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -34,26 +34,26 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-NOHSA-LABEL: constant_load_f64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
; GFX12-LABEL: constant_load_f64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 502cd14..876c246 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -25,13 +25,13 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: constant_load_i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
@@ -65,14 +65,14 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
;
; GFX12-LABEL: constant_load_i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: s_and_b32 s0, s0, 1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -101,13 +101,13 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: constant_load_v2i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -140,12 +140,12 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v2i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -174,13 +174,13 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: constant_load_v3i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -212,12 +212,12 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v3i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -246,13 +246,13 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: constant_load_v4i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -285,12 +285,12 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v4i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -319,13 +319,13 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: constant_load_v8i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -358,12 +358,12 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v8i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -392,13 +392,13 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v16i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -431,12 +431,12 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v16i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -460,13 +460,13 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v32i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -488,12 +488,12 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v32i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -518,14 +518,14 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v64i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
@@ -547,13 +547,13 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v64i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -582,13 +582,13 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: constant_zextload_i1_to_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -611,12 +611,12 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i1_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -647,13 +647,13 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: constant_sextload_i1_to_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -678,14 +678,14 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i1_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x10000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -715,13 +715,13 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v1i1_to_v1i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -744,12 +744,12 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i1_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -780,13 +780,13 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v1i1_to_v1i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -811,14 +811,14 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i1_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x10000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -850,13 +850,13 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v2i1_to_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v2
@@ -884,17 +884,17 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i1_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -926,13 +926,13 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v2i1_to_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v2
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
@@ -961,16 +961,16 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i1_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1004,13 +1004,13 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v3i1_to_v3i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: v_and_b32_e32 v5, 1, v0
@@ -1046,10 +1046,10 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v3i1_to_v3i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v3, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v3, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
@@ -1060,7 +1060,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1094,13 +1094,13 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v3i1_to_v3i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
@@ -1137,10 +1137,10 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v3i1_to_v3i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v3, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v3, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 1, v0
@@ -1148,7 +1148,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 1
; GFX12-NEXT: v_bfe_i32 v1, v4, 0, 1
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1182,13 +1182,13 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v4i1_to_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
@@ -1226,10 +1226,10 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i1_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0
@@ -1244,7 +1244,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v5
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1278,13 +1278,13 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v4i1_to_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
@@ -1323,10 +1323,10 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i1_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
@@ -1337,7 +1337,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_bfe_i32 v1, v5, 0, 1
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1376,17 +1376,17 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v8i1_to_v8i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v1, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 5, v1
; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1
@@ -1443,10 +1443,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i1_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v8, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v2, 5, v0
; GFX12-NEXT: v_lshrrev_b16 v5, 1, v0
@@ -1467,8 +1467,8 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v9
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v10
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1507,17 +1507,17 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v8i1_to_v8i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 4, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v5, 5, v0
@@ -1578,10 +1578,10 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i1_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v8, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 4, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0
@@ -1599,8 +1599,8 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: v_bfe_i32 v4, v1, 0, 1
; GFX12-NEXT: v_bfe_i32 v1, v9, 0, 1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1649,25 +1649,25 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v16i1_to_v16i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v1, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, 1
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v17, s5
; GFX8-NEXT: v_mov_b32_e32 v23, s1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s4
; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 12, v1
@@ -1767,10 +1767,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i1_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v16, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v16, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v2, 13, v0
; GFX12-NEXT: v_lshrrev_b16 v13, 1, v0
@@ -1811,10 +1811,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v22
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1863,24 +1863,24 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v16i1_to_v16i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v17, s5
; GFX8-NEXT: v_mov_b32_e32 v23, s1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s4
; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v12, 12, v0
@@ -1990,10 +1990,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i1_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v16, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v16, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 12, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 13, v0
@@ -2027,10 +2027,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX12-NEXT: v_bfe_i32 v1, v19, 0, 1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2132,112 +2132,112 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v32i1_to_v32i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s2
; GFX8-NEXT: v_and_b32_e32 v24, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2
; GFX8-NEXT: v_and_b32_e32 v22, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s2
; GFX8-NEXT: v_and_b32_e32 v15, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2
; GFX8-NEXT: v_and_b32_e32 v23, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s4
-; GFX8-NEXT: s_lshr_b32 s2, s4, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 11, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2
+; GFX8-NEXT: s_lshr_b32 s0, s2, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 11, s2
; GFX8-NEXT: v_and_b32_e32 v26, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v2
; GFX8-NEXT: v_and_b32_e32 v18, 1, v3
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s0
; GFX8-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 3, s2
-; GFX8-NEXT: s_bfe_u32 s5, s4, 0x10018
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s2
-; GFX8-NEXT: s_and_b32 s6, s4, 1
-; GFX8-NEXT: s_bfe_u32 s7, s4, 0x10013
-; GFX8-NEXT: s_bfe_u32 s8, s4, 0x10012
-; GFX8-NEXT: s_bfe_u32 s9, s4, 0x10011
-; GFX8-NEXT: s_bfe_u32 s10, s4, 0x10010
-; GFX8-NEXT: s_bfe_u32 s2, s4, 0x10017
-; GFX8-NEXT: s_bfe_u32 s3, s4, 0x10016
-; GFX8-NEXT: s_bfe_u32 s11, s4, 0x10015
-; GFX8-NEXT: s_bfe_u32 s12, s4, 0x10014
-; GFX8-NEXT: v_mov_b32_e32 v11, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NEXT: v_mov_b32_e32 v10, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 3, s0
+; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10018
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s0
+; GFX8-NEXT: s_and_b32 s6, s2, 1
+; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10013
+; GFX8-NEXT: s_bfe_u32 s8, s2, 0x10012
+; GFX8-NEXT: s_bfe_u32 s9, s2, 0x10011
+; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10010
+; GFX8-NEXT: s_bfe_u32 s0, s2, 0x10017
+; GFX8-NEXT: s_bfe_u32 s1, s2, 0x10016
+; GFX8-NEXT: s_bfe_u32 s11, s2, 0x10015
+; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10014
+; GFX8-NEXT: v_mov_b32_e32 v11, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 64
; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v9, s11
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v25, 2, s4
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v25, 2, s2
; GFX8-NEXT: v_mov_b32_e32 v8, s10
; GFX8-NEXT: v_mov_b32_e32 v9, s9
; GFX8-NEXT: v_mov_b32_e32 v10, s8
; GFX8-NEXT: v_mov_b32_e32 v11, s7
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v22
; GFX8-NEXT: v_and_b32_e32 v10, 1, v25
; GFX8-NEXT: v_and_b32_e32 v22, 1, v21
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v24
-; GFX8-NEXT: v_mov_b32_e32 v25, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 12, s4
-; GFX8-NEXT: v_mov_b32_e32 v24, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_mov_b32_e32 v25, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 12, s2
+; GFX8-NEXT: v_mov_b32_e32 v24, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v23
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s2
; GFX8-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 4, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 4, s2
; GFX8-NEXT: v_mov_b32_e32 v8, 1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_and_b32_e32 v12, 1, v19
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18
; GFX8-NEXT: v_and_b32_e32 v18, 1, v16
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17
; GFX8-NEXT: v_and_b32_sdwa v16, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX8-NEXT: v_mov_b32_e32 v17, s3
+; GFX8-NEXT: v_mov_b32_e32 v17, s1
; GFX8-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX8-NEXT: v_mov_b32_e32 v16, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s0
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v13, s5
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v26
; GFX8-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v12, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
@@ -2245,7 +2245,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -2349,56 +2349,56 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v32i1_to_v32i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s3, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v1, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v6, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v14, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
-; GFX12-NEXT: v_lshrrev_b16 v10, 1, s2
+; GFX12-NEXT: s_lshr_b32 s1, s0, 24
+; GFX12-NEXT: v_lshrrev_b16 v1, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v6, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v13, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v14, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 3, s1
+; GFX12-NEXT: v_lshrrev_b16 v10, 1, s0
; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_and_b32 v33, 1, v1
-; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v20, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v11, 2, s2
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: s_and_b32 s5, s2, 1
-; GFX12-NEXT: v_lshrrev_b16 v15, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v0, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v20, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v3, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v11, 2, s0
+; GFX12-NEXT: s_bfe_u32 s2, s0, 0x10018
+; GFX12-NEXT: s_and_b32 s3, s0, 1
+; GFX12-NEXT: v_lshrrev_b16 v15, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v16, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v19, 2, s1
; GFX12-NEXT: v_and_b32_e32 v25, 1, v14
; GFX12-NEXT: v_and_b32_e32 v26, 1, v18
; GFX12-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10012
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10013
+; GFX12-NEXT: s_bfe_u32 s6, s0, 0x10012
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v30, s6 :: v_dual_and_b32 v13, 1, v13
-; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10010
-; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10017
+; GFX12-NEXT: s_bfe_u32 s7, s0, 0x10011
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10010
+; GFX12-NEXT: s_bfe_u32 s9, s0, 0x10017
; GFX12-NEXT: v_dual_mov_b32 v27, s9 :: v_dual_and_b32 v24, 1, v6
-; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10016
+; GFX12-NEXT: s_bfe_u32 s10, s0, 0x10016
; GFX12-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10014
+; GFX12-NEXT: s_bfe_u32 s11, s0, 0x10014
; GFX12-NEXT: v_and_b32_e32 v23, 1, v4
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10015
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x10015
; GFX12-NEXT: v_and_b32_e32 v22, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10
; GFX12-NEXT: v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11
-; GFX12-NEXT: v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7
+; GFX12-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_and_b32 v6, 1, v7
; GFX12-NEXT: v_and_b32_e32 v4, 1, v5
; GFX12-NEXT: v_and_b32_e32 v10, 1, v3
; GFX12-NEXT: v_and_b32_e32 v14, 1, v19
@@ -2412,23 +2412,23 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_and_b32 v23, 0xffff, v20
; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v9
; GFX12-NEXT: v_and_b32_e32 v20, 1, v0
-; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_and_b32 v17, 0xffff, v25
-; GFX12-NEXT: v_mov_b32_e32 v25, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v17, 0xffff, v25
+; GFX12-NEXT: v_mov_b32_e32 v25, s0
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v22
; GFX12-NEXT: v_and_b32_e32 v22, 1, v12
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_and_b32 v15, 0xffff, v21
+; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_and_b32 v15, 0xffff, v21
; GFX12-NEXT: v_and_b32_e32 v21, 0xffff, v33
; GFX12-NEXT: v_and_b32_e32 v8, 1, v8
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5] offset:96
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2530,111 +2530,111 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v32i1_to_v32i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s3, s2, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 13, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 15, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 8, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 11, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 4, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 5, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v25, 1, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v27, 3, s3
-; GFX8-NEXT: s_bfe_i32 s4, s2, 0x10018
-; GFX8-NEXT: s_bfe_i32 s5, s2, 0x10000
-; GFX8-NEXT: s_bfe_i32 s6, s2, 0x10013
-; GFX8-NEXT: s_bfe_i32 s7, s2, 0x10012
-; GFX8-NEXT: s_bfe_i32 s8, s2, 0x10011
-; GFX8-NEXT: s_bfe_i32 s9, s2, 0x10010
-; GFX8-NEXT: s_bfe_i32 s3, s2, 0x10017
-; GFX8-NEXT: s_bfe_i32 s10, s2, 0x10016
-; GFX8-NEXT: s_bfe_i32 s11, s2, 0x10015
-; GFX8-NEXT: s_bfe_i32 s2, s2, 0x10014
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: s_lshr_b32 s1, s0, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 13, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 15, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 8, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 9, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 10, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 11, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 4, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v24, 5, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v25, 1, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v27, 3, s1
+; GFX8-NEXT: s_bfe_i32 s2, s0, 0x10018
+; GFX8-NEXT: s_bfe_i32 s3, s0, 0x10000
+; GFX8-NEXT: s_bfe_i32 s6, s0, 0x10013
+; GFX8-NEXT: s_bfe_i32 s7, s0, 0x10012
+; GFX8-NEXT: s_bfe_i32 s8, s0, 0x10011
+; GFX8-NEXT: s_bfe_i32 s9, s0, 0x10010
+; GFX8-NEXT: s_bfe_i32 s1, s0, 0x10017
+; GFX8-NEXT: s_bfe_i32 s10, s0, 0x10016
+; GFX8-NEXT: s_bfe_i32 s11, s0, 0x10015
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x10014
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 64
; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: v_mov_b32_e32 v2, s10
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 48
; GFX8-NEXT: v_mov_b32_e32 v0, s9
; GFX8-NEXT: v_mov_b32_e32 v1, s8
; GFX8-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s6
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_bfe_i32 v5, v24, 0, 1
; GFX8-NEXT: v_bfe_i32 v1, v25, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v25, s3
-; GFX8-NEXT: v_mov_b32_e32 v24, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_mov_b32_e32 v25, s1
+; GFX8-NEXT: v_mov_b32_e32 v24, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: v_bfe_i32 v4, v23, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v22, 0, 1
; GFX8-NEXT: v_bfe_i32 v22, v21, 0, 1
; GFX8-NEXT: v_bfe_i32 v21, v20, 0, 1
; GFX8-NEXT: v_bfe_i32 v20, v8, 0, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1
; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GFX8-NEXT: v_bfe_i32 v15, v15, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v17, s3
+; GFX8-NEXT: v_mov_b32_e32 v17, s1
; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1
; GFX8-NEXT: v_bfe_i32 v13, v13, 0, 1
; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v16, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s0
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v13, s5
; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 1
; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v8, s5
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NEXT: v_mov_b32_e32 v12, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 1
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX8-NEXT: v_bfe_i32 v3, v27, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_bfe_i32 v2, v26, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -2770,48 +2770,48 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v32i1_to_v32i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v14, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v11, 11, s2
-; GFX12-NEXT: s_lshr_b32 s3, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v4, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v6, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v1, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 3, s2
-; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10018
-; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10000
-; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10013
-; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10012
-; GFX12-NEXT: v_lshrrev_b16 v16, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 3, s3
-; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10011
-; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10010
-; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10017
-; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10016
-; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10014
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v0, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v13, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v14, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v10, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v11, 11, s0
+; GFX12-NEXT: s_lshr_b32 s1, s0, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v6, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v1, 1, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v3, 3, s0
+; GFX12-NEXT: s_bfe_i32 s2, s0, 0x10018
+; GFX12-NEXT: s_bfe_i32 s3, s0, 0x10000
+; GFX12-NEXT: s_bfe_i32 s6, s0, 0x10013
+; GFX12-NEXT: s_bfe_i32 s7, s0, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v16, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v20, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v19, 3, s1
+; GFX12-NEXT: s_bfe_i32 s1, s0, 0x10011
+; GFX12-NEXT: s_bfe_i32 s8, s0, 0x10010
+; GFX12-NEXT: s_bfe_i32 s9, s0, 0x10017
+; GFX12-NEXT: s_bfe_i32 s10, s0, 0x10016
+; GFX12-NEXT: s_bfe_i32 s11, s0, 0x10014
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x10015
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s2
+; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s0
; GFX12-NEXT: v_bfe_i32 v15, v14, 0, 1
; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v27, s9
-; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_mov_b32 v29, s3
+; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_mov_b32 v29, s1
; GFX12-NEXT: v_bfe_i32 v14, v13, 0, 1
; GFX12-NEXT: v_bfe_i32 v13, v12, 0, 1
; GFX12-NEXT: v_bfe_i32 v12, v0, 0, 1
@@ -2828,7 +2828,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 1
; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_bfe_i32 v23, v22, 0, 1
; GFX12-NEXT: v_bfe_i32 v22, v21, 0, 1
; GFX12-NEXT: v_bfe_i32 v21, v20, 0, 1
@@ -2837,16 +2837,16 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v18, v18, 0, 1
; GFX12-NEXT: v_bfe_i32 v17, v17, 0, 1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: v_mov_b32_e32 v16, s4
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:64
+; GFX12-NEXT: v_mov_b32_e32 v16, s2
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:96
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3025,99 +3025,99 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v64i1_to_v64i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s6, s3, 24
-; GFX8-NEXT: s_lshr_b32 s8, s2, 24
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX8-NEXT: s_bfe_u32 s5, s3, 0x10018
-; GFX8-NEXT: s_and_b32 s7, s3, 1
-; GFX8-NEXT: s_and_b32 s9, s2, 1
-; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10013
-; GFX8-NEXT: s_bfe_u32 s13, s2, 0x10012
-; GFX8-NEXT: s_bfe_u32 s14, s2, 0x10011
-; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10010
-; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10017
-; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10016
-; GFX8-NEXT: s_bfe_u32 s18, s2, 0x10015
-; GFX8-NEXT: s_bfe_u32 s19, s2, 0x10014
-; GFX8-NEXT: s_bfe_u32 s20, s3, 0x10013
-; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10012
-; GFX8-NEXT: s_bfe_u32 s22, s3, 0x10011
-; GFX8-NEXT: s_bfe_u32 s23, s3, 0x10010
-; GFX8-NEXT: s_bfe_u32 s10, s3, 0x10017
-; GFX8-NEXT: s_bfe_u32 s11, s3, 0x10016
-; GFX8-NEXT: s_bfe_u32 s24, s3, 0x10015
-; GFX8-NEXT: s_bfe_u32 s25, s3, 0x10014
+; GFX8-NEXT: s_lshr_b32 s6, s1, 24
+; GFX8-NEXT: s_lshr_b32 s8, s0, 24
+; GFX8-NEXT: s_bfe_u32 s2, s0, 0x10018
+; GFX8-NEXT: s_bfe_u32 s3, s1, 0x10018
+; GFX8-NEXT: s_and_b32 s7, s1, 1
+; GFX8-NEXT: s_and_b32 s9, s0, 1
+; GFX8-NEXT: s_bfe_u32 s12, s0, 0x10013
+; GFX8-NEXT: s_bfe_u32 s13, s0, 0x10012
+; GFX8-NEXT: s_bfe_u32 s14, s0, 0x10011
+; GFX8-NEXT: s_bfe_u32 s15, s0, 0x10010
+; GFX8-NEXT: s_bfe_u32 s16, s0, 0x10017
+; GFX8-NEXT: s_bfe_u32 s17, s0, 0x10016
+; GFX8-NEXT: s_bfe_u32 s18, s0, 0x10015
+; GFX8-NEXT: s_bfe_u32 s19, s0, 0x10014
+; GFX8-NEXT: s_bfe_u32 s20, s1, 0x10013
+; GFX8-NEXT: s_bfe_u32 s21, s1, 0x10012
+; GFX8-NEXT: s_bfe_u32 s22, s1, 0x10011
+; GFX8-NEXT: s_bfe_u32 s23, s1, 0x10010
+; GFX8-NEXT: s_bfe_u32 s10, s1, 0x10017
+; GFX8-NEXT: s_bfe_u32 s11, s1, 0x10016
+; GFX8-NEXT: s_bfe_u32 s24, s1, 0x10015
+; GFX8-NEXT: s_bfe_u32 s25, s1, 0x10014
; GFX8-NEXT: v_mov_b32_e32 v25, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xd0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xd0
; GFX8-NEXT: v_mov_b32_e32 v24, s11
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xc0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xc0
; GFX8-NEXT: v_mov_b32_e32 v22, s25
; GFX8-NEXT: v_mov_b32_e32 v23, s24
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0x50
+; GFX8-NEXT: s_add_u32 s10, s4, 0x50
; GFX8-NEXT: v_mov_b32_e32 v22, s23
; GFX8-NEXT: v_mov_b32_e32 v23, s22
; GFX8-NEXT: v_mov_b32_e32 v24, s21
; GFX8-NEXT: v_mov_b32_e32 v25, s20
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 64
+; GFX8-NEXT: s_add_u32 s10, s4, 64
; GFX8-NEXT: v_mov_b32_e32 v22, s19
; GFX8-NEXT: v_mov_b32_e32 v23, s18
; GFX8-NEXT: v_mov_b32_e32 v24, s17
; GFX8-NEXT: v_mov_b32_e32 v25, s16
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 48
+; GFX8-NEXT: s_add_u32 s10, s4, 48
; GFX8-NEXT: v_mov_b32_e32 v22, s15
; GFX8-NEXT: v_mov_b32_e32 v23, s14
; GFX8-NEXT: v_mov_b32_e32 v24, s13
; GFX8-NEXT: v_mov_b32_e32 v25, s12
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s1
; GFX8-NEXT: v_mov_b32_e32 v25, s11
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s0
; GFX8-NEXT: v_and_b32_e32 v21, 1, v0
; GFX8-NEXT: v_and_b32_e32 v27, 1, v22
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s1
; GFX8-NEXT: v_mov_b32_e32 v24, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 32
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2
+; GFX8-NEXT: s_add_u32 s10, s4, 32
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s0
; GFX8-NEXT: v_and_b32_e32 v28, 1, v22
; GFX8-NEXT: v_and_b32_e32 v22, 1, v20
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s0
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX8-NEXT: v_and_b32_e32 v20, 1, v19
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v14, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v1
; GFX8-NEXT: v_and_b32_e32 v18, 1, v2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s0
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 3, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 3, s1
; GFX8-NEXT: v_mov_b32_e32 v25, 1
; GFX8-NEXT: v_mov_b32_e32 v21, s11
; GFX8-NEXT: v_and_b32_e32 v12, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s0
; GFX8-NEXT: v_and_b32_e32 v23, 1, v19
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18
; GFX8-NEXT: v_and_b32_e32 v18, 1, v16
@@ -3129,129 +3129,129 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GFX8-NEXT: v_and_b32_e32 v20, 1, v14
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s6
-; GFX8-NEXT: s_add_u32 s10, s0, 16
+; GFX8-NEXT: s_add_u32 s10, s4, 16
; GFX8-NEXT: v_and_b32_e32 v17, 1, v14
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v15
; GFX8-NEXT: v_lshrrev_b16_e64 v15, 3, s6
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s0
; GFX8-NEXT: v_and_b32_e32 v19, 1, v15
; GFX8-NEXT: v_mov_b32_e32 v16, s11
; GFX8-NEXT: v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s0
; GFX8-NEXT: v_and_b32_e32 v13, 1, v13
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX8-NEXT: v_and_b32_e32 v11, 1, v11
; GFX8-NEXT: v_mov_b32_e32 v15, s10
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s0
; GFX8-NEXT: v_and_b32_e32 v10, 1, v0
; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14]
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s8
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: v_mov_b32_e32 v13, s5
; GFX8-NEXT: v_and_b32_e32 v15, 1, v11
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v10
; GFX8-NEXT: v_and_b32_e32 v10, 1, v9
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v8
; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: v_mov_b32_e32 v12, s4
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: s_add_u32 s10, s0, 0xb0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xb0
; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s8
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s1
; GFX8-NEXT: v_and_b32_e32 v6, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 14, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 14, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s1
; GFX8-NEXT: v_and_b32_e32 v11, 1, v8
; GFX8-NEXT: v_lshrrev_b16_e64 v8, 3, s8
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v9, s10
; GFX8-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s1
; GFX8-NEXT: v_and_b32_e32 v13, 1, v8
; GFX8-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 15, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 15, s1
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX8-NEXT: v_and_b32_e32 v5, 1, v5
; GFX8-NEXT: v_mov_b32_e32 v10, s11
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 10, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 0xa0
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 10, s1
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v26, 6, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 2, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v18, 2, s6
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 4, s8
; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s8
; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8]
-; GFX8-NEXT: s_add_u32 s2, s0, 0xa0
+; GFX8-NEXT: v_and_b32_e32 v10, 1, v16
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v13
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v17
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v26, 6, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 2, s3
-; GFX8-NEXT: v_and_b32_e32 v10, 1, v16
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_and_b32_e32 v8, 1, v14
; GFX8-NEXT: v_and_b32_e32 v14, 1, v18
; GFX8-NEXT: v_and_b32_e32 v18, 1, v3
; GFX8-NEXT: v_and_b32_sdwa v16, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v15
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v19
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v4
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x90
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x90
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v1
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v20
; GFX8-NEXT: v_and_b32_e32 v20, 1, v0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_lshrrev_b16_e64 v4, 6, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x80
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x80
; GFX8-NEXT: v_and_b32_e32 v18, 1, v4
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v23
; GFX8-NEXT: v_and_b32_e32 v3, 1, v22
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v27
; GFX8-NEXT: v_and_b32_e32 v22, 1, v26
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v28
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xf0
; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s6
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[1:4]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_lshrrev_b16_e64 v19, 7, s6
; GFX8-NEXT: v_and_b32_e32 v16, 1, v24
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s8
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_and_b32_e32 v6, 1, v12
-; GFX8-NEXT: v_mov_b32_e32 v12, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v12, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v11
; GFX8-NEXT: v_lshrrev_b16_e64 v11, 7, s8
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_endpgm
@@ -3444,113 +3444,113 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v64i1_to_v64i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v2, 13, s2
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
+; GFX12-NEXT: v_lshrrev_b16 v2, 13, s0
+; GFX12-NEXT: s_lshr_b32 s2, s1, 24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: v_lshrrev_b16 v10, 13, s3
-; GFX12-NEXT: v_lshrrev_b16 v3, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v10, 13, s1
+; GFX12-NEXT: v_lshrrev_b16 v3, 9, s0
; GFX12-NEXT: v_and_b32_e32 v45, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 1, s4
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 11, s3
-; GFX12-NEXT: v_lshrrev_b16 v14, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 5, s4
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: s_and_b32 s6, s3, 1
-; GFX12-NEXT: s_bfe_u32 s14, s3, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 1, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 11, s1
+; GFX12-NEXT: v_lshrrev_b16 v14, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 5, s2
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: s_and_b32 s6, s1, 1
+; GFX12-NEXT: s_bfe_u32 s14, s1, 0x10012
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v55, s14 :: v_dual_and_b32 v36, 1, v10
; GFX12-NEXT: v_and_b32_e32 v10, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 3, s4
-; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v11, 9, s3
-; GFX12-NEXT: v_lshrrev_b16 v13, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v15, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v2, 3, s2
+; GFX12-NEXT: v_lshrrev_b16 v6, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v11, 9, s1
+; GFX12-NEXT: v_lshrrev_b16 v13, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v15, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v16, 3, s1
; GFX12-NEXT: v_and_b32_e32 v43, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5
-; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10014
+; GFX12-NEXT: v_lshrrev_b16 v4, 3, s3
+; GFX12-NEXT: s_bfe_u32 s19, s1, 0x10014
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v49, s19 :: v_dual_and_b32 v42, 1, v3
-; GFX12-NEXT: v_lshrrev_b16 v3, 5, s5
-; GFX12-NEXT: s_bfe_u32 s13, s3, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v29, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3
-; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v25, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v3, 5, s3
+; GFX12-NEXT: s_bfe_u32 s13, s1, 0x10013
+; GFX12-NEXT: v_lshrrev_b16 v29, 12, s1
+; GFX12-NEXT: v_lshrrev_b16 v30, 14, s1
+; GFX12-NEXT: v_lshrrev_b16 v31, 15, s1
+; GFX12-NEXT: v_lshrrev_b16 v25, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v26, 10, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 6, s1
; GFX12-NEXT: v_dual_mov_b32 v56, s13 :: v_dual_and_b32 v27, 1, v12
-; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v19, 2, s1
; GFX12-NEXT: v_and_b32_e32 v12, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 1, s5
-; GFX12-NEXT: s_and_b32 s7, s2, 1
-; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10011
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, s3
+; GFX12-NEXT: s_and_b32 s7, s0, 1
+; GFX12-NEXT: s_bfe_u32 s15, s1, 0x10011
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v54, s15 :: v_dual_and_b32 v35, 1, v8
-; GFX12-NEXT: v_lshrrev_b16 v8, 7, s5
-; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10010
+; GFX12-NEXT: v_lshrrev_b16 v8, 7, s3
+; GFX12-NEXT: s_bfe_u32 s16, s1, 0x10010
; GFX12-NEXT: v_dual_mov_b32 v53, s16 :: v_dual_and_b32 v40, 1, v7
-; GFX12-NEXT: v_lshrrev_b16 v7, 2, s5
-; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10017
-; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10016
+; GFX12-NEXT: v_lshrrev_b16 v7, 2, s3
+; GFX12-NEXT: s_bfe_u32 s17, s1, 0x10017
+; GFX12-NEXT: s_bfe_u32 s18, s1, 0x10016
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v51, s18 :: v_dual_and_b32 v44, 1, v5
-; GFX12-NEXT: v_lshrrev_b16 v5, 4, s5
-; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v5, 4, s3
+; GFX12-NEXT: s_bfe_u32 s13, s0, 0x10015
; GFX12-NEXT: v_and_b32_e32 v23, 1, v14
; GFX12-NEXT: v_and_b32_e32 v14, 1, v18
-; GFX12-NEXT: v_lshrrev_b16 v18, 6, s5
-; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
-; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3
+; GFX12-NEXT: s_bfe_u32 s3, s1, 0x10018
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x10015
; GFX12-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_and_b32 v39, 1, v6
; GFX12-NEXT: v_and_b32_e32 v32, 1, v11
-; GFX12-NEXT: v_lshrrev_b16 v11, 2, s4
-; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v11, 2, s2
+; GFX12-NEXT: s_bfe_u32 s9, s0, 0x10012
; GFX12-NEXT: v_and_b32_e32 v20, 1, v16
-; GFX12-NEXT: v_lshrrev_b16 v16, 7, s4
-; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10010
+; GFX12-NEXT: v_lshrrev_b16 v16, 7, s2
+; GFX12-NEXT: s_bfe_u32 s11, s0, 0x10010
; GFX12-NEXT: v_and_b32_e32 v24, 1, v15
-; GFX12-NEXT: v_lshrrev_b16 v15, 6, s4
-; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017
-; GFX12-NEXT: v_mov_b32_e32 v50, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
+; GFX12-NEXT: v_lshrrev_b16 v15, 6, s2
+; GFX12-NEXT: s_bfe_u32 s12, s0, 0x10017
+; GFX12-NEXT: v_mov_b32_e32 v50, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10016
; GFX12-NEXT: v_and_b32_e32 v28, 1, v13
-; GFX12-NEXT: v_lshrrev_b16 v13, 4, s4
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
+; GFX12-NEXT: v_lshrrev_b16 v13, 4, s2
+; GFX12-NEXT: s_bfe_u32 s2, s0, 0x10018
; GFX12-NEXT: v_and_b32_e32 v6, 1, v3
; GFX12-NEXT: v_and_b32_e32 v3, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v17, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v1, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v33, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v41, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v38, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v34, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v37, 4, s2
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013
-; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10014
+; GFX12-NEXT: v_lshrrev_b16 v17, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v1, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v33, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v41, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v38, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v34, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v37, 4, s0
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10013
+; GFX12-NEXT: s_bfe_u32 s10, s0, 0x10011
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x10014
; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
; GFX12-NEXT: v_and_b32_e32 v21, 1, v21
; GFX12-NEXT: v_and_b32_e32 v29, 1, v29
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[4:5] offset:192
; GFX12-NEXT: v_mov_b32_e32 v52, s12
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v3
; GFX12-NEXT: v_dual_mov_b32 v54, s10 :: v_dual_and_b32 v3, 1, v7
; GFX12-NEXT: v_dual_mov_b32 v56, s8 :: v_dual_and_b32 v7, 1, v18
-; GFX12-NEXT: v_dual_mov_b32 v49, s2 :: v_dual_mov_b32 v50, s13
-; GFX12-NEXT: v_mov_b32_e32 v51, s3
+; GFX12-NEXT: v_dual_mov_b32 v49, s0 :: v_dual_mov_b32 v50, s13
+; GFX12-NEXT: v_mov_b32_e32 v51, s1
; GFX12-NEXT: v_dual_mov_b32 v53, s11 :: v_dual_and_b32 v18, 0xffff, v24
; GFX12-NEXT: v_and_b32_e32 v24, 0xffff, v23
; GFX12-NEXT: v_and_b32_e32 v23, 1, v22
@@ -3583,28 +3583,28 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v33, s7 :: v_dual_and_b32 v14, 0xffff, v14
; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_and_b32 v42, 0xffff, v42
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_and_b32 v42, 0xffff, v42
; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX12-NEXT: v_and_b32_e32 v5, 1, v5
; GFX12-NEXT: v_and_b32_e32 v37, 1, v37
-; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v6, 0xffff, v6
+; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v6, 0xffff, v6
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_clause 0xd
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v0, v[45:48], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v0, v[41:44], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v0, v[37:40], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v0, v[33:36], s[0:1]
-; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:160
-; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:144
-; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:128
-; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v0, v[45:48], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v0, v[41:44], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v0, v[37:40], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v0, v[33:36], s[4:5]
+; GFX12-NEXT: global_store_b128 v0, v[29:32], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v0, v[25:28], s[4:5] offset:160
+; GFX12-NEXT: global_store_b128 v0, v[21:24], s[4:5] offset:144
+; GFX12-NEXT: global_store_b128 v0, v[17:20], s[4:5] offset:128
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[4:5] offset:96
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3783,84 +3783,84 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v64i1_to_v64i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 13, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 15, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 8, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 11, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v10, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s2
-; GFX8-NEXT: s_lshr_b32 s7, s3, 24
-; GFX8-NEXT: s_lshr_b32 s8, s2, 24
-; GFX8-NEXT: s_bfe_i32 s4, s2, 0x10018
-; GFX8-NEXT: s_bfe_i32 s5, s3, 0x10018
-; GFX8-NEXT: s_bfe_i32 s6, s3, 0x10000
-; GFX8-NEXT: s_bfe_i32 s9, s2, 0x10000
-; GFX8-NEXT: s_bfe_i32 s12, s2, 0x10013
-; GFX8-NEXT: s_bfe_i32 s13, s2, 0x10012
-; GFX8-NEXT: s_bfe_i32 s14, s2, 0x10011
-; GFX8-NEXT: s_bfe_i32 s15, s2, 0x10010
-; GFX8-NEXT: s_bfe_i32 s16, s2, 0x10017
-; GFX8-NEXT: s_bfe_i32 s17, s2, 0x10016
-; GFX8-NEXT: s_bfe_i32 s18, s2, 0x10015
-; GFX8-NEXT: s_bfe_i32 s2, s2, 0x10014
-; GFX8-NEXT: s_bfe_i32 s19, s3, 0x10013
-; GFX8-NEXT: s_bfe_i32 s20, s3, 0x10012
-; GFX8-NEXT: s_bfe_i32 s21, s3, 0x10011
-; GFX8-NEXT: s_bfe_i32 s22, s3, 0x10010
-; GFX8-NEXT: s_bfe_i32 s10, s3, 0x10017
-; GFX8-NEXT: s_bfe_i32 s11, s3, 0x10016
-; GFX8-NEXT: s_bfe_i32 s23, s3, 0x10015
-; GFX8-NEXT: s_bfe_i32 s24, s3, 0x10014
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 13, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 15, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 8, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 11, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 4, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s0
+; GFX8-NEXT: s_lshr_b32 s7, s1, 24
+; GFX8-NEXT: s_lshr_b32 s8, s0, 24
+; GFX8-NEXT: s_bfe_i32 s2, s0, 0x10018
+; GFX8-NEXT: s_bfe_i32 s3, s1, 0x10018
+; GFX8-NEXT: s_bfe_i32 s6, s1, 0x10000
+; GFX8-NEXT: s_bfe_i32 s9, s0, 0x10000
+; GFX8-NEXT: s_bfe_i32 s12, s0, 0x10013
+; GFX8-NEXT: s_bfe_i32 s13, s0, 0x10012
+; GFX8-NEXT: s_bfe_i32 s14, s0, 0x10011
+; GFX8-NEXT: s_bfe_i32 s15, s0, 0x10010
+; GFX8-NEXT: s_bfe_i32 s16, s0, 0x10017
+; GFX8-NEXT: s_bfe_i32 s17, s0, 0x10016
+; GFX8-NEXT: s_bfe_i32 s18, s0, 0x10015
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x10014
+; GFX8-NEXT: s_bfe_i32 s19, s1, 0x10013
+; GFX8-NEXT: s_bfe_i32 s20, s1, 0x10012
+; GFX8-NEXT: s_bfe_i32 s21, s1, 0x10011
+; GFX8-NEXT: s_bfe_i32 s22, s1, 0x10010
+; GFX8-NEXT: s_bfe_i32 s10, s1, 0x10017
+; GFX8-NEXT: s_bfe_i32 s11, s1, 0x10016
+; GFX8-NEXT: s_bfe_i32 s23, s1, 0x10015
+; GFX8-NEXT: s_bfe_i32 s24, s1, 0x10014
; GFX8-NEXT: v_mov_b32_e32 v25, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xd0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xd0
; GFX8-NEXT: v_mov_b32_e32 v24, s11
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xc0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xc0
; GFX8-NEXT: v_mov_b32_e32 v22, s24
; GFX8-NEXT: v_mov_b32_e32 v23, s23
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0x50
+; GFX8-NEXT: s_add_u32 s10, s4, 0x50
; GFX8-NEXT: v_mov_b32_e32 v22, s22
; GFX8-NEXT: v_mov_b32_e32 v23, s21
; GFX8-NEXT: v_mov_b32_e32 v24, s20
; GFX8-NEXT: v_mov_b32_e32 v25, s19
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 64
-; GFX8-NEXT: v_mov_b32_e32 v22, s2
+; GFX8-NEXT: s_add_u32 s10, s4, 64
+; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: v_mov_b32_e32 v23, s18
; GFX8-NEXT: v_mov_b32_e32 v24, s17
; GFX8-NEXT: v_mov_b32_e32 v25, s16
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 48
+; GFX8-NEXT: s_add_u32 s10, s4, 48
; GFX8-NEXT: v_mov_b32_e32 v22, s15
; GFX8-NEXT: v_mov_b32_e32 v23, s14
; GFX8-NEXT: v_mov_b32_e32 v24, s13
; GFX8-NEXT: v_mov_b32_e32 v25, s12
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v23, s11
@@ -3868,58 +3868,58 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1
; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v22, s10
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 13, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 10, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 11, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v26, 4, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v27, 5, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v28, 6, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 13, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 10, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v24, 11, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v26, 4, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v27, 5, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v28, 6, s1
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 1, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 1, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1
; GFX8-NEXT: v_bfe_i32 v15, v15, 0, 1
; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[14:17]
; GFX8-NEXT: v_bfe_i32 v13, v13, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v15, s3
+; GFX8-NEXT: v_mov_b32_e32 v15, s1
; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1
; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v14, s2
+; GFX8-NEXT: v_mov_b32_e32 v14, s0
; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NEXT: s_add_u32 s2, s0, 0xb0
-; GFX8-NEXT: v_mov_b32_e32 v12, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0xb0
+; GFX8-NEXT: v_mov_b32_e32 v12, s5
; GFX8-NEXT: v_bfe_i32 v10, v9, 0, 1
; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1
; GFX8-NEXT: v_bfe_i32 v8, v7, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v7, s9
-; GFX8-NEXT: v_mov_b32_e32 v11, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s8
-; GFX8-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NEXT: v_mov_b32_e32 v8, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v10, 5, s8
; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1
; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1
; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 1
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v7, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xa0
+; GFX8-NEXT: v_mov_b32_e32 v7, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xa0
; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s8
; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s8
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
@@ -3929,21 +3929,21 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: v_bfe_i32 v7, v10, 0, 1
; GFX8-NEXT: v_bfe_i32 v11, v1, 0, 1
; GFX8-NEXT: v_bfe_i32 v10, v0, 0, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1
; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 1
; GFX8-NEXT: v_bfe_i32 v3, v12, 0, 1
; GFX8-NEXT: v_bfe_i32 v6, v13, 0, 1
; GFX8-NEXT: v_bfe_i32 v13, v24, 0, 1
; GFX8-NEXT: v_bfe_i32 v12, v2, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x90
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x90
; GFX8-NEXT: v_lshrrev_b16_e64 v19, 5, s7
; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v12, v15, 0, 1
; GFX8-NEXT: v_bfe_i32 v15, v19, 0, 1
; GFX8-NEXT: v_bfe_i32 v19, v23, 0, 1
@@ -3951,48 +3951,48 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: v_bfe_i32 v24, v28, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v27, 0, 1
; GFX8-NEXT: v_bfe_i32 v22, v26, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x80
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x80
; GFX8-NEXT: v_lshrrev_b16_e64 v18, 4, s7
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[22:25]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v11, v14, 0, 1
; GFX8-NEXT: v_bfe_i32 v14, v18, 0, 1
; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1
; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v18, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0xf0
; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s7
; GFX8-NEXT: v_lshrrev_b16_e64 v17, 7, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[18:21]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[14:17]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v13, v2, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v10, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v10, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
; GFX8-NEXT: v_lshrrev_b16_e64 v9, 7, s8
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9]
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX8-NEXT: s_endpgm
@@ -4244,82 +4244,82 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v64i1_to_v64i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v28, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v29, 13, s3
-; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3
-; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v24, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v25, 9, s3
-; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3
-; GFX12-NEXT: v_lshrrev_b16 v27, 11, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v23, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 3, s3
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
-; GFX12-NEXT: v_lshrrev_b16 v4, 4, s5
-; GFX12-NEXT: v_lshrrev_b16 v5, 5, s5
-; GFX12-NEXT: v_lshrrev_b16 v6, 6, s5
-; GFX12-NEXT: v_lshrrev_b16 v1, 3, s5
-; GFX12-NEXT: v_lshrrev_b16 v2, 2, s5
-; GFX12-NEXT: v_lshrrev_b16 v7, 1, s5
-; GFX12-NEXT: v_lshrrev_b16 v44, 7, s5
-; GFX12-NEXT: s_bfe_i32 s5, s3, 0x10018
-; GFX12-NEXT: s_bfe_i32 s6, s3, 0x10000
-; GFX12-NEXT: s_bfe_i32 s13, s3, 0x10013
-; GFX12-NEXT: s_bfe_i32 s14, s3, 0x10012
-; GFX12-NEXT: s_bfe_i32 s15, s3, 0x10011
-; GFX12-NEXT: s_bfe_i32 s16, s3, 0x10010
-; GFX12-NEXT: s_bfe_i32 s17, s3, 0x10017
-; GFX12-NEXT: s_bfe_i32 s18, s3, 0x10016
-; GFX12-NEXT: s_bfe_i32 s19, s3, 0x10014
-; GFX12-NEXT: s_bfe_i32 s3, s3, 0x10015
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: v_lshrrev_b16 v28, 12, s1
+; GFX12-NEXT: v_lshrrev_b16 v29, 13, s1
+; GFX12-NEXT: v_lshrrev_b16 v30, 14, s1
+; GFX12-NEXT: v_lshrrev_b16 v31, 15, s1
+; GFX12-NEXT: v_lshrrev_b16 v24, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v25, 9, s1
+; GFX12-NEXT: v_lshrrev_b16 v26, 10, s1
+; GFX12-NEXT: v_lshrrev_b16 v27, 11, s1
+; GFX12-NEXT: v_lshrrev_b16 v20, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v23, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v19, 3, s1
+; GFX12-NEXT: s_lshr_b32 s2, s1, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 4, s3
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v6, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v2, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v7, 1, s3
+; GFX12-NEXT: v_lshrrev_b16 v44, 7, s3
+; GFX12-NEXT: s_bfe_i32 s3, s1, 0x10018
+; GFX12-NEXT: s_bfe_i32 s6, s1, 0x10000
+; GFX12-NEXT: s_bfe_i32 s13, s1, 0x10013
+; GFX12-NEXT: s_bfe_i32 s14, s1, 0x10012
+; GFX12-NEXT: s_bfe_i32 s15, s1, 0x10011
+; GFX12-NEXT: s_bfe_i32 s16, s1, 0x10010
+; GFX12-NEXT: s_bfe_i32 s17, s1, 0x10017
+; GFX12-NEXT: s_bfe_i32 s18, s1, 0x10016
+; GFX12-NEXT: s_bfe_i32 s19, s1, 0x10014
+; GFX12-NEXT: s_bfe_i32 s1, s1, 0x10015
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v56, 0 :: v_dual_mov_b32 v49, s3
+; GFX12-NEXT: v_dual_mov_b32 v56, 0 :: v_dual_mov_b32 v49, s1
; GFX12-NEXT: v_dual_mov_b32 v48, s19 :: v_dual_mov_b32 v51, s17
; GFX12-NEXT: v_dual_mov_b32 v50, s18 :: v_dual_mov_b32 v53, s15
-; GFX12-NEXT: v_lshrrev_b16 v16, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v16, 14, s0
; GFX12-NEXT: v_dual_mov_b32 v52, s16 :: v_dual_mov_b32 v55, s13
-; GFX12-NEXT: s_bfe_i32 s13, s2, 0x10015
+; GFX12-NEXT: s_bfe_i32 s13, s0, 0x10015
; GFX12-NEXT: v_mov_b32_e32 v54, s14
-; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v32, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 4, s4
-; GFX12-NEXT: v_lshrrev_b16 v13, 5, s4
-; GFX12-NEXT: v_lshrrev_b16 v14, 6, s4
-; GFX12-NEXT: v_lshrrev_b16 v15, 7, s4
-; GFX12-NEXT: v_lshrrev_b16 v40, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v41, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v42, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v43, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v36, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v37, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v38, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v39, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v33, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v34, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v35, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 1, s4
-; GFX12-NEXT: v_lshrrev_b16 v10, 2, s4
-; GFX12-NEXT: v_lshrrev_b16 v11, 3, s4
-; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10018
-; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10000
-; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10013
-; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10012
-; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10011
-; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10010
-; GFX12-NEXT: s_bfe_i32 s12, s2, 0x10017
-; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10016
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10014
+; GFX12-NEXT: v_lshrrev_b16 v0, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v32, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 4, s2
+; GFX12-NEXT: v_lshrrev_b16 v13, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v14, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v15, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v40, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v41, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v42, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v43, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v36, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v37, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v38, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v39, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v33, 1, s0
+; GFX12-NEXT: v_lshrrev_b16 v34, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v35, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v10, 2, s2
+; GFX12-NEXT: v_lshrrev_b16 v11, 3, s2
+; GFX12-NEXT: s_bfe_i32 s2, s0, 0x10018
+; GFX12-NEXT: s_bfe_i32 s7, s0, 0x10000
+; GFX12-NEXT: s_bfe_i32 s8, s0, 0x10013
+; GFX12-NEXT: s_bfe_i32 s9, s0, 0x10012
+; GFX12-NEXT: s_bfe_i32 s10, s0, 0x10011
+; GFX12-NEXT: s_bfe_i32 s11, s0, 0x10010
+; GFX12-NEXT: s_bfe_i32 s12, s0, 0x10017
+; GFX12-NEXT: s_bfe_i32 s1, s0, 0x10016
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x10014
; GFX12-NEXT: v_bfe_i32 v23, v23, 0, 1
; GFX12-NEXT: v_bfe_i32 v22, v22, 0, 1
; GFX12-NEXT: v_bfe_i32 v21, v21, 0, 1
@@ -4329,10 +4329,10 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v29, v29, 0, 1
; GFX12-NEXT: v_bfe_i32 v28, v28, 0, 1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v56, v[48:51], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v56, v[52:55], s[0:1] offset:192
-; GFX12-NEXT: v_dual_mov_b32 v49, s13 :: v_dual_mov_b32 v48, s2
-; GFX12-NEXT: v_dual_mov_b32 v51, s12 :: v_dual_mov_b32 v50, s3
+; GFX12-NEXT: global_store_b128 v56, v[48:51], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v56, v[52:55], s[4:5] offset:192
+; GFX12-NEXT: v_dual_mov_b32 v49, s13 :: v_dual_mov_b32 v48, s0
+; GFX12-NEXT: v_dual_mov_b32 v51, s12 :: v_dual_mov_b32 v50, s1
; GFX12-NEXT: v_mov_b32_e32 v53, s10
; GFX12-NEXT: v_bfe_i32 v19, v19, 0, 1
; GFX12-NEXT: v_bfe_i32 v18, v18, 0, 1
@@ -4362,7 +4362,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v42, v42, 0, 1
; GFX12-NEXT: v_bfe_i32 v41, v41, 0, 1
; GFX12-NEXT: v_bfe_i32 v40, v40, 0, 1
-; GFX12-NEXT: v_mov_b32_e32 v8, s5
+; GFX12-NEXT: v_mov_b32_e32 v8, s3
; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 1
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 1
; GFX12-NEXT: v_bfe_i32 v4, v4, 0, 1
@@ -4376,22 +4376,22 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v33, v33, 0, 1
; GFX12-NEXT: v_mov_b32_e32 v32, s7
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v56, v[48:51], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v56, v[52:55], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v56, v[44:47], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v56, v[40:43], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v56, v[36:39], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v56, v[32:35], s[0:1]
-; GFX12-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:160
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_store_b128 v56, v[48:51], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v56, v[52:55], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v56, v[44:47], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v56, v[40:43], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v56, v[36:39], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v56, v[32:35], s[4:5]
+; GFX12-NEXT: global_store_b128 v56, v[28:31], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v56, v[24:27], s[4:5] offset:160
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:144
-; GFX12-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:128
-; GFX12-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v56, v[20:23], s[4:5] offset:144
+; GFX12-NEXT: global_store_b128 v56, v[16:19], s[4:5] offset:128
+; GFX12-NEXT: global_store_b128 v56, v[12:15], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v56, v[8:11], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v56, v[4:7], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v56, v[0:3], s[4:5] offset:96
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4423,14 +4423,14 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: constant_zextload_i1_to_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -4455,14 +4455,14 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i1_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: s_and_b32 s0, s0, 1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4494,13 +4494,13 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: constant_sextload_i1_to_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -4527,14 +4527,14 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i1_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4566,14 +4566,14 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v1i1_to_v1i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -4598,14 +4598,14 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i1_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: s_and_b32 s0, s0, 1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4637,13 +4637,13 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v1i1_to_v1i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -4670,14 +4670,14 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i1_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4711,14 +4711,14 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v2i1_to_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v3, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0
@@ -4749,17 +4749,17 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i1_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4794,13 +4794,13 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v2i1_to_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1
@@ -4833,10 +4833,10 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i1_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
@@ -4845,7 +4845,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4882,17 +4882,17 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v3i1_to_v3i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, v5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v7, s1
+; GFX8-NEXT: v_mov_b32_e32 v6, s0
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0
@@ -4900,10 +4900,10 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX8-NEXT: v_and_b32_e32 v8, 1, v0
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v8
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -4935,10 +4935,10 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v3i1_to_v3i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v5, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v5, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v5, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v2, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
@@ -4950,8 +4950,8 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v3, v5 :: v_dual_and_b32 v4, 0xffff, v3
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v6
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v5, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b64 v5, v[4:5], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v5, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4990,17 +4990,17 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v3i1_to_v3i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v7, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 2, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0
@@ -5045,10 +5045,10 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v3i1_to_v3i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v6, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v6, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v6, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0
@@ -5062,8 +5062,8 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v6, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5102,21 +5102,21 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v4i1_to_v4i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v0
@@ -5162,10 +5162,10 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i1_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v6, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
@@ -5182,8 +5182,8 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v9
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5225,17 +5225,17 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v4i1_to_v4i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 2, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 3, v0
@@ -5286,10 +5286,10 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i1_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v8, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
@@ -5307,8 +5307,8 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5357,24 +5357,24 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v8i1_to_v8i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: v_mov_b32_e32 v17, s5
; GFX8-NEXT: v_mov_b32_e32 v23, s1
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
@@ -5382,7 +5382,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v11, v1
; GFX8-NEXT: v_mov_b32_e32 v13, v1
; GFX8-NEXT: v_mov_b32_e32 v15, v1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s4
; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
@@ -5457,10 +5457,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i1_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v12, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0
@@ -5482,10 +5482,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v18
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5541,24 +5541,24 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v8i1_to_v8i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v17, s5
; GFX8-NEXT: v_mov_b32_e32 v23, s1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s4
; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 6, v0
@@ -5646,10 +5646,10 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i1_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v16, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v16, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v3, 6, v1
; GFX12-NEXT: v_lshrrev_b16 v5, 7, v1
@@ -5675,10 +5675,10 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5750,21 +5750,21 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v16i1_to_v16i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v6, v2
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: s_add_u32 s4, s0, 0x50
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v23, s5
-; GFX8-NEXT: v_mov_b32_e32 v22, s4
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s4, 0x50
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v23, s3
+; GFX8-NEXT: v_mov_b32_e32 v22, s2
; GFX8-NEXT: v_mov_b32_e32 v9, v2
; GFX8-NEXT: v_mov_b32_e32 v11, v2
; GFX8-NEXT: v_mov_b32_e32 v12, v2
@@ -5781,49 +5781,49 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v1
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[5:8]
-; GFX8-NEXT: v_mov_b32_e32 v23, s3
+; GFX8-NEXT: v_mov_b32_e32 v23, s1
+; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v0
-; GFX8-NEXT: v_mov_b32_e32 v22, s2
+; GFX8-NEXT: s_add_u32 s0, s4, 64
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 15, v0
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v7, v2
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[1:4]
; GFX8-NEXT: v_mov_b32_e32 v23, v2
; GFX8-NEXT: v_mov_b32_e32 v3, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_and_b32_sdwa v8, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 9, v0
-; GFX8-NEXT: s_add_u32 s2, s0, 0x60
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v3
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 12, v0
; GFX8-NEXT: flat_store_dwordx4 v[1:2], v[8:11]
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NEXT: v_lshrrev_b16_e32 v16, 5, v0
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
; GFX8-NEXT: v_and_b32_e32 v11, 1, v6
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 13, v0
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: flat_store_dwordx4 v[3:4], v[11:14]
-; GFX8-NEXT: s_add_u32 s0, s0, 16
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 7, v0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 6, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v14, 4, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v16, 5, v0
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0
; GFX8-NEXT: v_and_b32_e32 v18, 1, v14
; GFX8-NEXT: v_and_b32_e32 v14, 1, v6
@@ -5832,13 +5832,13 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: v_and_b32_e32 v16, 1, v16
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v13, s1
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v16
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v3
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_mov_b32_e32 v12, s0
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v0
@@ -5934,10 +5934,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i1_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v28, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 11, v0
@@ -5981,14 +5981,14 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v38
; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v37
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[28:31], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[16:19], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[20:23], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[24:27], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[28:31], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6073,40 +6073,40 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v16i1_to_v16i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s3
-; GFX8-NEXT: v_mov_b32_e32 v5, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x60
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v8, s3
-; GFX8-NEXT: v_mov_b32_e32 v7, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v10, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 64
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s3
-; GFX8-NEXT: v_mov_b32_e32 v15, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v23, s3
-; GFX8-NEXT: v_mov_b32_e32 v22, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v21, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v20, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v25, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NEXT: v_mov_b32_e32 v7, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_mov_b32_e32 v9, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 64
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v16, s1
+; GFX8-NEXT: v_mov_b32_e32 v15, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v23, s1
+; GFX8-NEXT: v_mov_b32_e32 v22, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v25, s1
+; GFX8-NEXT: v_mov_b32_e32 v24, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, s5
; GFX8-NEXT: v_mov_b32_e32 v27, s1
-; GFX8-NEXT: v_mov_b32_e32 v24, s2
+; GFX8-NEXT: v_mov_b32_e32 v20, s4
; GFX8-NEXT: v_mov_b32_e32 v26, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v0
@@ -6267,10 +6267,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i1_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v32, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v32, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v32, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v3, 14, v1
; GFX12-NEXT: v_lshrrev_b16 v5, 15, v1
@@ -6320,14 +6320,14 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6447,86 +6447,86 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v32i1_to_v32i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s0
; GFX8-NEXT: v_and_b32_e32 v15, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s0
; GFX8-NEXT: v_and_b32_e32 v8, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s0
; GFX8-NEXT: v_and_b32_e32 v11, 1, v2
; GFX8-NEXT: v_and_b32_e32 v2, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2
-; GFX8-NEXT: s_lshr_b32 s14, s2, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s0
+; GFX8-NEXT: s_lshr_b32 s14, s0, 24
; GFX8-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2
-; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10018
-; GFX8-NEXT: s_and_b32 s11, s2, 1
-; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10011
-; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10010
-; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10012
-; GFX8-NEXT: s_bfe_u32 s18, s2, 0x10013
-; GFX8-NEXT: s_bfe_u32 s19, s2, 0x10014
-; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10015
-; GFX8-NEXT: s_bfe_u32 s21, s2, 0x10016
-; GFX8-NEXT: s_bfe_u32 s22, s2, 0x10017
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s0
+; GFX8-NEXT: s_bfe_u32 s10, s0, 0x10018
+; GFX8-NEXT: s_and_b32 s11, s0, 1
+; GFX8-NEXT: s_bfe_u32 s15, s0, 0x10011
+; GFX8-NEXT: s_bfe_u32 s16, s0, 0x10010
+; GFX8-NEXT: s_bfe_u32 s17, s0, 0x10012
+; GFX8-NEXT: s_bfe_u32 s18, s0, 0x10013
+; GFX8-NEXT: s_bfe_u32 s19, s0, 0x10014
+; GFX8-NEXT: s_bfe_u32 s20, s0, 0x10015
+; GFX8-NEXT: s_bfe_u32 s21, s0, 0x10016
+; GFX8-NEXT: s_bfe_u32 s22, s0, 0x10017
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 14, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 10, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 4, s0
; GFX8-NEXT: v_and_b32_e32 v10, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 15, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xb0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: s_add_u32 s4, s0, 0xa0
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: s_add_u32 s6, s0, 0x90
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NEXT: s_add_u32 s8, s0, 0x80
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
-; GFX8-NEXT: s_add_u32 s12, s0, 0x70
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 15, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xb0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s4, 0xa0
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x90
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NEXT: s_add_u32 s8, s4, 0x80
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
+; GFX8-NEXT: s_add_u32 s12, s4, 0x70
; GFX8-NEXT: v_and_b32_e32 v16, 1, v1
; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s14
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v1
; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s14
; GFX8-NEXT: v_mov_b32_e32 v23, s13
; GFX8-NEXT: v_and_b32_e32 v25, 1, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 0xf0
+; GFX8-NEXT: s_add_u32 s12, s4, 0xf0
; GFX8-NEXT: v_and_b32_e32 v18, 1, v4
; GFX8-NEXT: v_mov_b32_e32 v19, v1
; GFX8-NEXT: v_mov_b32_e32 v21, v1
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
; GFX8-NEXT: v_mov_b32_e32 v23, s13
; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s14
; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 0x60
+; GFX8-NEXT: s_add_u32 s12, s4, 0x60
; GFX8-NEXT: v_and_b32_e32 v18, 1, v6
; GFX8-NEXT: v_lshrrev_b16_e64 v20, 7, s14
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s14
; GFX8-NEXT: v_and_b32_e32 v18, 1, v14
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v15
; GFX8-NEXT: v_mov_b32_e32 v15, s13
; GFX8-NEXT: v_mov_b32_e32 v14, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 0x50
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_add_u32 s12, s4, 0x50
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v23, s13
; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 64
+; GFX8-NEXT: s_add_u32 s12, s4, 64
; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[18:21]
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: v_and_b32_e32 v18, 1, v9
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v11
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
@@ -6539,17 +6539,17 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v2
; GFX8-NEXT: v_and_b32_e32 v18, 1, v7
-; GFX8-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NEXT: v_mov_b32_e32 v8, s1
; GFX8-NEXT: v_and_b32_e32 v21, 1, v3
; GFX8-NEXT: v_mov_b32_e32 v0, s21
; GFX8-NEXT: v_mov_b32_e32 v2, s22
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: v_mov_b32_e32 v7, s0
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v8, s5
+; GFX8-NEXT: v_mov_b32_e32 v8, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s19
; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: v_mov_b32_e32 v7, s4
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v8, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s17
@@ -6557,62 +6557,62 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v7, s8
+; GFX8-NEXT: s_add_u32 s0, s4, 48
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v2, s15
; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: s_add_u32 s2, s0, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_and_b32_e32 v15, 1, v24
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: v_mov_b32_e32 v22, v1
; GFX8-NEXT: v_mov_b32_e32 v24, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[21:24]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v5
; GFX8-NEXT: v_mov_b32_e32 v21, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[18:21]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX8-NEXT: v_and_b32_e32 v9, 1, v12
; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v12, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s5
+; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12]
; GFX8-NEXT: v_mov_b32_e32 v0, s11
; GFX8-NEXT: v_mov_b32_e32 v2, v14
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, s0
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: v_mov_b32_e32 v7, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xd0
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s14
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xd0
; GFX8-NEXT: v_mov_b32_e32 v16, v1
; GFX8-NEXT: v_mov_b32_e32 v18, v1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s14
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[15:18]
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s14
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[15:18]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v4
; GFX8-NEXT: v_and_b32_e32 v4, 1, v26
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v25
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: v_mov_b32_e32 v5, s1
@@ -6783,71 +6783,71 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v32i1_to_v32i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v0, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2
-; GFX12-NEXT: s_lshr_b32 s3, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v0, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v3, 11, s0
+; GFX12-NEXT: s_lshr_b32 s1, s0, 24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: v_lshrrev_b16 v2, 12, s2
+; GFX12-NEXT: v_lshrrev_b16 v2, 12, s0
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v16, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v23, 1, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
+; GFX12-NEXT: v_lshrrev_b16 v4, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v16, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v20, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 3, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v23, 1, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10016
; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10017
-; GFX12-NEXT: v_lshrrev_b16 v11, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
+; GFX12-NEXT: s_bfe_u32 s6, s0, 0x10017
+; GFX12-NEXT: v_lshrrev_b16 v11, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v13, 3, s0
; GFX12-NEXT: v_and_b32_e32 v24, 1, v4
; GFX12-NEXT: v_and_b32_e32 v25, 1, v8
; GFX12-NEXT: v_and_b32_e32 v28, 1, v21
; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0
-; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3
+; GFX12-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_and_b32 v21, 0xffff, v3
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10015
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v15, 1, s2
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10014
+; GFX12-NEXT: s_bfe_u32 s6, s0, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v15, 1, s0
; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
; GFX12-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:176
; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v6, 10, s2
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10012
+; GFX12-NEXT: s_bfe_u32 s6, s0, 0x10013
+; GFX12-NEXT: v_lshrrev_b16 v6, 10, s0
; GFX12-NEXT: v_and_b32_e32 v26, 1, v15
; GFX12-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_and_b32 v15, 1, v9
; GFX12-NEXT: v_and_b32_e32 v9, 1, v17
; GFX12-NEXT: v_and_b32_e32 v29, 1, v23
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:160
; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v14, 2, s2
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: s_and_b32 s5, s2, 1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: v_lshrrev_b16 v5, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v10, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v14, 2, s0
+; GFX12-NEXT: s_bfe_u32 s2, s0, 0x10018
+; GFX12-NEXT: s_and_b32 s3, s0, 1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10011
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x10010
; GFX12-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_and_b32 v19, 1, v6
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v13
; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v24
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v13, v1
; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v26
; GFX12-NEXT: v_and_b32_e32 v4, 1, v14
; GFX12-NEXT: v_and_b32_e32 v8, 1, v12
@@ -6856,9 +6856,9 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v16
; GFX12-NEXT: v_and_b32_e32 v39, 1, v7
; GFX12-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_and_b32 v41, 0xffff, v5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:128
; GFX12-NEXT: v_mov_b32_e32 v5, v1
-; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v0, s5
+; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_and_b32 v29, 0xffff, v9
; GFX12-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_and_b32 v23, 1, v22
; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v27, 1, v20
@@ -6870,26 +6870,26 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v25
; GFX12-NEXT: v_mov_b32_e32 v24, v1
; GFX12-NEXT: s_clause 0x4
-; GFX12-NEXT: global_store_b128 v1, v[35:38], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v1, v[39:42], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v1, v[19:22], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v1, v[15:18], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v1, v[35:38], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v1, v[39:42], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[31:34], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[19:22], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[15:18], s[4:5] offset:64
; GFX12-NEXT: v_mov_b32_e32 v15, v1
; GFX12-NEXT: v_mov_b32_e32 v11, v1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_dual_mov_b32 v2, v44 :: v_dual_mov_b32 v9, v1
; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v25, 0xffff, v28
; GFX12-NEXT: v_mov_b32_e32 v28, v1
; GFX12-NEXT: s_clause 0x4
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v1, v[23:26], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:192
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7067,43 +7067,43 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v32i1_to_v32i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s6, s4, 22
-; GFX8-NEXT: s_lshr_b32 s8, s4, 23
-; GFX8-NEXT: s_lshr_b32 s10, s4, 20
-; GFX8-NEXT: s_lshr_b32 s12, s4, 21
-; GFX8-NEXT: s_lshr_b32 s14, s4, 18
-; GFX8-NEXT: s_lshr_b32 s16, s4, 19
-; GFX8-NEXT: s_lshr_b32 s18, s4, 16
-; GFX8-NEXT: s_lshr_b32 s20, s4, 17
-; GFX8-NEXT: s_lshr_b32 s2, s4, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 15, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 12, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 13, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 10, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 11, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 8, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v10, 9, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 7, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 2, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 5, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 3, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v27, 1, s2
+; GFX8-NEXT: s_lshr_b32 s6, s2, 22
+; GFX8-NEXT: s_lshr_b32 s8, s2, 23
+; GFX8-NEXT: s_lshr_b32 s10, s2, 20
+; GFX8-NEXT: s_lshr_b32 s12, s2, 21
+; GFX8-NEXT: s_lshr_b32 s14, s2, 18
+; GFX8-NEXT: s_lshr_b32 s16, s2, 19
+; GFX8-NEXT: s_lshr_b32 s18, s2, 16
+; GFX8-NEXT: s_lshr_b32 s20, s2, 17
+; GFX8-NEXT: s_lshr_b32 s0, s2, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 15, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 12, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 13, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 10, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 11, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 8, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 9, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 7, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 2, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 5, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 3, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v27, 1, s0
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX8-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
@@ -7113,33 +7113,33 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX8-NEXT: v_mov_b32_e32 v21, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0xb0
+; GFX8-NEXT: s_add_u32 s6, s4, 0xb0
; GFX8-NEXT: v_mov_b32_e32 v22, s7
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v26, s7
; GFX8-NEXT: v_mov_b32_e32 v25, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0xa0
+; GFX8-NEXT: s_add_u32 s6, s4, 0xa0
; GFX8-NEXT: v_mov_b32_e32 v23, s8
; GFX8-NEXT: v_mov_b32_e32 v24, s9
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
; GFX8-NEXT: v_mov_b32_e32 v26, s7
; GFX8-NEXT: v_mov_b32_e32 v25, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0x90
+; GFX8-NEXT: s_add_u32 s6, s4, 0x90
; GFX8-NEXT: v_mov_b32_e32 v21, s10
; GFX8-NEXT: v_mov_b32_e32 v22, s11
; GFX8-NEXT: v_mov_b32_e32 v23, s12
; GFX8-NEXT: v_mov_b32_e32 v24, s13
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
; GFX8-NEXT: v_mov_b32_e32 v26, s7
; GFX8-NEXT: v_mov_b32_e32 v25, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0x80
+; GFX8-NEXT: s_add_u32 s6, s4, 0x80
; GFX8-NEXT: v_mov_b32_e32 v21, s14
; GFX8-NEXT: v_mov_b32_e32 v22, s15
; GFX8-NEXT: v_mov_b32_e32 v23, s16
; GFX8-NEXT: v_mov_b32_e32 v24, s17
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
; GFX8-NEXT: v_mov_b32_e32 v26, s7
; GFX8-NEXT: v_mov_b32_e32 v21, s18
@@ -7147,15 +7147,15 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v23, s20
; GFX8-NEXT: v_mov_b32_e32 v24, s21
; GFX8-NEXT: v_mov_b32_e32 v25, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0x70
+; GFX8-NEXT: s_add_u32 s6, s4, 0x70
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_bfe_i32 v23, v3, 0, 1
; GFX8-NEXT: v_bfe_i32 v21, v2, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 0x60
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x60
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX8-NEXT: v_mov_b32_e32 v26, s7
@@ -7163,29 +7163,29 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v25, s6
; GFX8-NEXT: v_bfe_i32 v23, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v21, v15, 0, 1
-; GFX8-NEXT: s_add_u32 s6, s0, 0x50
+; GFX8-NEXT: s_add_u32 s6, s4, 0x50
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
; GFX8-NEXT: v_bfe_i32 v25, v14, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v13, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v14, s7
; GFX8-NEXT: v_mov_b32_e32 v13, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 64
+; GFX8-NEXT: s_add_u32 s6, s4, 64
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[23:26]
; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1
; GFX8-NEXT: v_bfe_i32 v25, v10, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v9, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v10, s7
; GFX8-NEXT: v_mov_b32_e32 v9, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 48
+; GFX8-NEXT: s_add_u32 s6, s4, 48
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[23:26]
; GFX8-NEXT: v_bfe_i32 v10, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v25, v8, 0, 1
@@ -7194,18 +7194,18 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX8-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 32
+; GFX8-NEXT: s_add_u32 s6, s4, 32
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[23:26]
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_bfe_i32 v25, v5, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v4, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX8-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 16
+; GFX8-NEXT: s_add_u32 s6, s4, 16
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[23:26]
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_bfe_i32 v25, v1, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v0, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -7214,44 +7214,44 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_bfe_i32 v6, v7, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_add_u32 s4, s0, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_add_u32 s2, s4, 0xf0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v10
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xe0
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xd0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xe0
; GFX8-NEXT: v_bfe_i32 v17, v18, 0, 1
; GFX8-NEXT: v_bfe_i32 v15, v16, 0, 1
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v18, 31, v17
; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xd0
; GFX8-NEXT: v_bfe_i32 v21, v20, 0, 1
; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[15:18]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_bfe_i32 v2, v27, 0, 1
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_bfe_i32 v2, v27, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -7448,42 +7448,42 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v32i1_to_v32i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v26, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v28, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v4, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 3, s2
-; GFX12-NEXT: s_lshr_b32 s22, s2, 24
-; GFX12-NEXT: s_lshr_b32 s12, s2, 22
-; GFX12-NEXT: s_lshr_b32 s14, s2, 23
-; GFX12-NEXT: v_lshrrev_b16 v6, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 1, s2
-; GFX12-NEXT: s_lshr_b32 s16, s2, 20
-; GFX12-NEXT: s_lshr_b32 s18, s2, 21
-; GFX12-NEXT: v_lshrrev_b16 v1, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v26, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v28, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v4, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 3, s0
+; GFX12-NEXT: s_lshr_b32 s22, s0, 24
+; GFX12-NEXT: s_lshr_b32 s12, s0, 22
+; GFX12-NEXT: s_lshr_b32 s14, s0, 23
+; GFX12-NEXT: v_lshrrev_b16 v6, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v3, 1, s0
+; GFX12-NEXT: s_lshr_b32 s16, s0, 20
+; GFX12-NEXT: s_lshr_b32 s18, s0, 21
+; GFX12-NEXT: v_lshrrev_b16 v1, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 15, s0
; GFX12-NEXT: v_lshrrev_b16 v12, 6, s22
; GFX12-NEXT: v_lshrrev_b16 v14, 7, s22
-; GFX12-NEXT: v_lshrrev_b16 v9, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v10, 13, s0
; GFX12-NEXT: v_lshrrev_b16 v16, 4, s22
; GFX12-NEXT: v_lshrrev_b16 v17, 5, s22
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX12-NEXT: s_lshr_b32 s4, s2, 18
-; GFX12-NEXT: v_lshrrev_b16 v37, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v34, 11, s2
+; GFX12-NEXT: s_lshr_b32 s2, s0, 18
+; GFX12-NEXT: v_lshrrev_b16 v37, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v34, 11, s0
; GFX12-NEXT: v_lshrrev_b16 v13, 2, s22
; GFX12-NEXT: v_lshrrev_b16 v15, 3, s22
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v49, s12
-; GFX12-NEXT: v_lshrrev_b16 v30, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v32, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v30, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v32, 9, s0
; GFX12-NEXT: v_lshrrev_b16 v11, 1, s22
; GFX12-NEXT: v_bfe_i32 v7, v5, 0, 1
; GFX12-NEXT: v_bfe_i32 v5, v4, 0, 1
@@ -7491,15 +7491,15 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v29, v26, 0, 1
; GFX12-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_mov_b32 v51, s14
; GFX12-NEXT: v_dual_mov_b32 v52, s15 :: v_dual_mov_b32 v53, s16
-; GFX12-NEXT: s_lshr_b32 s6, s2, 19
+; GFX12-NEXT: s_lshr_b32 s6, s0, 19
; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 1
; GFX12-NEXT: v_bfe_i32 v27, v8, 0, 1
; GFX12-NEXT: v_bfe_i32 v25, v6, 0, 1
; GFX12-NEXT: v_dual_mov_b32 v54, s17 :: v_dual_mov_b32 v55, s18
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v56, s19
-; GFX12-NEXT: s_lshr_b32 s10, s2, 16
-; GFX12-NEXT: s_lshr_b32 s20, s2, 17
+; GFX12-NEXT: s_lshr_b32 s10, s0, 16
+; GFX12-NEXT: s_lshr_b32 s20, s0, 17
; GFX12-NEXT: v_bfe_i32 v23, v14, 0, 1
; GFX12-NEXT: v_bfe_i32 v21, v12, 0, 1
; GFX12-NEXT: v_bfe_i32 v47, v2, 0, 1
@@ -7509,7 +7509,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v43, v10, 0, 1
; GFX12-NEXT: v_bfe_i32 v41, v9, 0, 1
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX12-NEXT: s_bfe_i64 s[8:9], s[2:3], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[0:1], 0x10000
; GFX12-NEXT: v_bfe_i32 v15, v15, 0, 1
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 1
; GFX12-NEXT: v_bfe_i32 v39, v34, 0, 1
@@ -7524,18 +7524,18 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v33, v30, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v30, 31, v29
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:160
-; GFX12-NEXT: v_dual_mov_b32 v49, s4 :: v_dual_mov_b32 v50, s5
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[4:5] offset:160
+; GFX12-NEXT: v_dual_mov_b32 v49, s2 :: v_dual_mov_b32 v50, s3
; GFX12-NEXT: v_dual_mov_b32 v51, s6 :: v_dual_mov_b32 v52, s7
; GFX12-NEXT: v_mov_b32_e32 v53, s10
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[22:23], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[22:23], 0x10000
; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27
; GFX12-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX12-NEXT: v_dual_mov_b32 v54, s11 :: v_dual_mov_b32 v55, s20
; GFX12-NEXT: v_dual_mov_b32 v56, s21 :: v_dual_mov_b32 v1, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v9, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v9, s0
; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX12-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX12-NEXT: v_ashrrev_i32_e32 v48, 31, v47
@@ -7552,22 +7552,22 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v36, 31, v35
; GFX12-NEXT: v_ashrrev_i32_e32 v34, 31, v33
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:144
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:128
-; GFX12-NEXT: global_store_b128 v0, v[45:48], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v0, v[41:44], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v0, v[37:40], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v0, v[33:36], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:32
-; GFX12-NEXT: v_mov_b32_e32 v10, s3
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[4:5] offset:144
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[4:5] offset:128
+; GFX12-NEXT: global_store_b128 v0, v[45:48], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v0, v[41:44], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v0, v[37:40], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v0, v[33:36], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v0, v[29:32], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v0, v[25:28], s[4:5] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v10, s1
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1]
-; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[4:5]
+; GFX12-NEXT: global_store_b128 v0, v[21:24], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v0, v[17:20], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[4:5] offset:192
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7783,159 +7783,159 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v64i1_to_v64i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s0
; GFX8-NEXT: v_and_b32_e32 v18, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s0
; GFX8-NEXT: v_and_b32_e32 v16, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s0
; GFX8-NEXT: v_and_b32_e32 v15, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s0
; GFX8-NEXT: v_and_b32_e32 v13, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s0
; GFX8-NEXT: v_and_b32_e32 v10, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s2
-; GFX8-NEXT: s_lshr_b32 s33, s3, 24
-; GFX8-NEXT: s_lshr_b32 s24, s2, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s0
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s0
+; GFX8-NEXT: s_lshr_b32 s33, s1, 24
+; GFX8-NEXT: s_lshr_b32 s24, s0, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 10, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s0
; GFX8-NEXT: v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2
-; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10018
-; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10018
-; GFX8-NEXT: s_and_b32 s22, s3, 1
-; GFX8-NEXT: s_and_b32 s23, s2, 1
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s2
-; GFX8-NEXT: s_bfe_u32 s25, s2, 0x10011
-; GFX8-NEXT: s_bfe_u32 s26, s2, 0x10010
-; GFX8-NEXT: s_bfe_u32 s27, s2, 0x10012
-; GFX8-NEXT: s_bfe_u32 s28, s2, 0x10013
-; GFX8-NEXT: s_bfe_u32 s29, s2, 0x10014
-; GFX8-NEXT: s_bfe_u32 s30, s2, 0x10015
-; GFX8-NEXT: s_bfe_u32 s31, s2, 0x10016
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x10017
-; GFX8-NEXT: s_bfe_u32 s34, s3, 0x10011
-; GFX8-NEXT: s_bfe_u32 s35, s3, 0x10010
-; GFX8-NEXT: s_bfe_u32 s36, s3, 0x10012
-; GFX8-NEXT: s_bfe_u32 s37, s3, 0x10013
-; GFX8-NEXT: s_bfe_u32 s38, s3, 0x10016
-; GFX8-NEXT: s_bfe_u32 s39, s3, 0x10017
-; GFX8-NEXT: s_bfe_u32 s40, s3, 0x10015
-; GFX8-NEXT: s_bfe_u32 s41, s3, 0x10014
-; GFX8-NEXT: s_add_u32 s4, s0, 0x1a0
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1b0
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NEXT: s_add_u32 s8, s0, 0x190
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
-; GFX8-NEXT: s_add_u32 s10, s0, 0x180
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: s_add_u32 s12, s0, 0xb0
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
-; GFX8-NEXT: s_add_u32 s14, s0, 0xa0
-; GFX8-NEXT: s_addc_u32 s15, s1, 0
-; GFX8-NEXT: s_add_u32 s16, s0, 0x90
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
-; GFX8-NEXT: s_add_u32 s18, s0, 0x80
-; GFX8-NEXT: s_addc_u32 s19, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s3
-; GFX8-NEXT: s_add_u32 s42, s0, 0x70
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s0
+; GFX8-NEXT: s_bfe_u32 s20, s0, 0x10018
+; GFX8-NEXT: s_bfe_u32 s21, s1, 0x10018
+; GFX8-NEXT: s_and_b32 s22, s1, 1
+; GFX8-NEXT: s_and_b32 s23, s0, 1
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s0
+; GFX8-NEXT: s_bfe_u32 s25, s0, 0x10011
+; GFX8-NEXT: s_bfe_u32 s26, s0, 0x10010
+; GFX8-NEXT: s_bfe_u32 s27, s0, 0x10012
+; GFX8-NEXT: s_bfe_u32 s28, s0, 0x10013
+; GFX8-NEXT: s_bfe_u32 s29, s0, 0x10014
+; GFX8-NEXT: s_bfe_u32 s30, s0, 0x10015
+; GFX8-NEXT: s_bfe_u32 s31, s0, 0x10016
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x10017
+; GFX8-NEXT: s_bfe_u32 s34, s1, 0x10011
+; GFX8-NEXT: s_bfe_u32 s35, s1, 0x10010
+; GFX8-NEXT: s_bfe_u32 s36, s1, 0x10012
+; GFX8-NEXT: s_bfe_u32 s37, s1, 0x10013
+; GFX8-NEXT: s_bfe_u32 s38, s1, 0x10016
+; GFX8-NEXT: s_bfe_u32 s39, s1, 0x10017
+; GFX8-NEXT: s_bfe_u32 s40, s1, 0x10015
+; GFX8-NEXT: s_bfe_u32 s41, s1, 0x10014
+; GFX8-NEXT: s_add_u32 s2, s4, 0x1a0
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x1b0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NEXT: s_add_u32 s8, s4, 0x190
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
+; GFX8-NEXT: s_add_u32 s10, s4, 0x180
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
+; GFX8-NEXT: s_add_u32 s12, s4, 0xb0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
+; GFX8-NEXT: s_add_u32 s14, s4, 0xa0
+; GFX8-NEXT: s_addc_u32 s15, s5, 0
+; GFX8-NEXT: s_add_u32 s16, s4, 0x90
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
+; GFX8-NEXT: s_add_u32 s18, s4, 0x80
+; GFX8-NEXT: s_addc_u32 s19, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s1
+; GFX8-NEXT: s_add_u32 s42, s4, 0x70
; GFX8-NEXT: v_and_b32_e32 v7, 1, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v23, s42
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v24, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x170
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x170
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s1
; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[2:5]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v22, 1, v22
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 15, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v24, 15, s1
; GFX8-NEXT: v_mov_b32_e32 v23, v1
; GFX8-NEXT: v_mov_b32_e32 v25, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[22:25]
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s3
-; GFX8-NEXT: s_add_u32 s42, s0, 0x1f0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s1
+; GFX8-NEXT: s_add_u32 s42, s4, 0x1f0
; GFX8-NEXT: v_lshrrev_b16_e64 v21, 6, s33
; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v21, 1, v21
; GFX8-NEXT: v_lshrrev_b16_e64 v23, 7, s33
; GFX8-NEXT: v_mov_b32_e32 v22, v1
; GFX8-NEXT: v_mov_b32_e32 v24, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0xf0
+; GFX8-NEXT: s_add_u32 s42, s4, 0xf0
; GFX8-NEXT: v_lshrrev_b16_e64 v20, 6, s24
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[21:24]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v22, 1, v20
; GFX8-NEXT: v_lshrrev_b16_e64 v24, 7, s24
; GFX8-NEXT: v_mov_b32_e32 v23, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x60
+; GFX8-NEXT: s_add_u32 s42, s4, 0x60
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v19
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v18
; GFX8-NEXT: v_mov_b32_e32 v18, s42
; GFX8-NEXT: v_mov_b32_e32 v19, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x50
+; GFX8-NEXT: s_add_u32 s42, s4, 0x50
; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v17
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v16
; GFX8-NEXT: v_mov_b32_e32 v16, s42
; GFX8-NEXT: v_mov_b32_e32 v17, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 64
+; GFX8-NEXT: s_add_u32 s42, s4, 64
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v17, 1
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v26, s42
; GFX8-NEXT: v_and_b32_sdwa v22, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v15
; GFX8-NEXT: v_mov_b32_e32 v27, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 48
+; GFX8-NEXT: s_add_u32 s42, s4, 48
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v14
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v13
; GFX8-NEXT: v_mov_b32_e32 v13, s42
; GFX8-NEXT: v_mov_b32_e32 v14, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 32
+; GFX8-NEXT: s_add_u32 s42, s4, 32
; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v11
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v10
; GFX8-NEXT: v_mov_b32_e32 v10, s42
; GFX8-NEXT: v_mov_b32_e32 v11, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 16
+; GFX8-NEXT: s_add_u32 s42, s4, 16
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v9
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v8
; GFX8-NEXT: v_mov_b32_e32 v8, s42
; GFX8-NEXT: v_mov_b32_e32 v9, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x160
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x160
+; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v10, 3, s33
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[22:25]
; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s33
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v22, s42
; GFX8-NEXT: v_and_b32_e32 v28, 1, v10
; GFX8-NEXT: v_and_b32_e32 v19, 1, v8
@@ -7945,12 +7945,12 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v11, v1
; GFX8-NEXT: v_mov_b32_e32 v23, s43
; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s24
-; GFX8-NEXT: s_add_u32 s42, s0, 0x150
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x150
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s1
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
; GFX8-NEXT: v_and_b32_e32 v22, 1, v5
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v4
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v4, s42
; GFX8-NEXT: v_and_b32_e32 v7, 1, v21
; GFX8-NEXT: v_mov_b32_e32 v8, v1
@@ -7958,28 +7958,28 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[7:10]
; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s24
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s1
; GFX8-NEXT: v_and_b32_e32 v10, 1, v4
; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s24
-; GFX8-NEXT: s_add_u32 s42, s0, 0x140
-; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x140
+; GFX8-NEXT: v_mov_b32_e32 v6, s1
; GFX8-NEXT: v_and_b32_e32 v20, 1, v2
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v8, s42
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 6, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 6, s1
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v4
; GFX8-NEXT: v_and_b32_sdwa v4, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v20
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v9, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x130
+; GFX8-NEXT: s_add_u32 s42, s4, 0x130
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 5, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 5, s1
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v7, 1, v18
; GFX8-NEXT: v_mov_b32_e32 v17, s42
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
@@ -7988,25 +7988,25 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v8, v1
; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v18, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x120
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x120
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s1
; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[7:10]
; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v3
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 3, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 1, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 3, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 1, s1
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v19
; GFX8-NEXT: v_and_b32_e32 v16, 1, v16
; GFX8-NEXT: v_mov_b32_e32 v17, v1
; GFX8-NEXT: v_mov_b32_e32 v19, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x110
+; GFX8-NEXT: s_add_u32 s42, s4, 0x110
; GFX8-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s1
; GFX8-NEXT: v_and_b32_e32 v13, 1, v13
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v15
@@ -8015,13 +8015,13 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v20, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v13
-; GFX8-NEXT: v_mov_b32_e32 v13, s5
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20]
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s41
; GFX8-NEXT: v_mov_b32_e32 v2, s40
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v12, s4
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v13, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s38
@@ -8040,7 +8040,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v12, s12
; GFX8-NEXT: v_mov_b32_e32 v0, s31
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v13, s13
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v12, s14
@@ -8058,66 +8058,66 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s25
; GFX8-NEXT: v_mov_b32_e32 v13, s19
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
-; GFX8-NEXT: s_add_u32 s2, s0, 0x100
+; GFX8-NEXT: v_mov_b32_e32 v13, s5
+; GFX8-NEXT: s_add_u32 s0, s4, 0x100
; GFX8-NEXT: v_mov_b32_e32 v0, s23
; GFX8-NEXT: v_mov_b32_e32 v2, v10
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v12, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s33
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x1e0
; GFX8-NEXT: v_mov_b32_e32 v0, s22
; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s33
+; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: v_and_b32_e32 v26, 1, v14
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v27, 4, s33
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x1d0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v27
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v26
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x1c0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s33
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
; GFX8-NEXT: v_and_b32_e32 v14, 1, v14
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v28
; GFX8-NEXT: v_mov_b32_e32 v15, v1
; GFX8-NEXT: v_mov_b32_e32 v17, v1
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[14:17]
; GFX8-NEXT: v_mov_b32_e32 v0, s21
; GFX8-NEXT: v_mov_b32_e32 v2, v5
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_lshrrev_b16_e64 v23, 4, s24
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xd0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xd0
; GFX8-NEXT: v_and_b32_e32 v7, 1, v23
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v22
; GFX8-NEXT: v_mov_b32_e32 v8, v1
; GFX8-NEXT: v_mov_b32_e32 v10, v1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 2, s24
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[7:10]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 2, s24
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
; GFX8-NEXT: v_and_b32_e32 v4, 1, v21
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v5, s1
@@ -8434,58 +8434,58 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v64i1_to_v64i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v6, 7, s0
+; GFX12-NEXT: s_lshr_b32 s2, s1, 24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: v_lshrrev_b16 v14, 13, s3
+; GFX12-NEXT: v_lshrrev_b16 v14, 13, s1
; GFX12-NEXT: v_and_b32_e32 v34, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v18, 9, s3
+; GFX12-NEXT: v_lshrrev_b16 v18, 9, s1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_dual_mov_b32 v28, v1 :: v_dual_and_b32 v41, 1, v6
-; GFX12-NEXT: v_lshrrev_b16 v4, 5, s4
-; GFX12-NEXT: v_lshrrev_b16 v6, 3, s4
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v3, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v23, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v24, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v25, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v4, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v6, 3, s2
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: v_lshrrev_b16 v3, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v10, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v23, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v24, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v25, 3, s1
; GFX12-NEXT: v_and_b32_e32 v50, 1, v14
; GFX12-NEXT: v_and_b32_e32 v47, 1, v18
; GFX12-NEXT: v_and_b32_e32 v18, 1, v4
; GFX12-NEXT: v_and_b32_e32 v14, 1, v6
-; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5
-; GFX12-NEXT: v_lshrrev_b16 v6, 5, s5
-; GFX12-NEXT: v_lshrrev_b16 v0, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v4, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v6, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v0, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 1, s0
; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
; GFX12-NEXT: v_and_b32_e32 v42, 1, v8
; GFX12-NEXT: v_and_b32_e32 v52, 1, v10
; GFX12-NEXT: v_and_b32_e32 v40, 1, v23
; GFX12-NEXT: v_dual_mov_b32 v44, v1 :: v_dual_and_b32 v43, 1, v24
-; GFX12-NEXT: v_lshrrev_b16 v8, 1, s5
-; GFX12-NEXT: v_lshrrev_b16 v10, 2, s5
-; GFX12-NEXT: v_lshrrev_b16 v24, 4, s5
-; GFX12-NEXT: s_bfe_u32 s7, s3, 0x10014
+; GFX12-NEXT: v_lshrrev_b16 v8, 1, s3
+; GFX12-NEXT: v_lshrrev_b16 v10, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v24, 4, s3
+; GFX12-NEXT: s_bfe_u32 s7, s1, 0x10014
; GFX12-NEXT: v_and_b32_e32 v33, 1, v25
; GFX12-NEXT: v_and_b32_e32 v25, 1, v6
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10015
+; GFX12-NEXT: s_bfe_u32 s8, s1, 0x10015
; GFX12-NEXT: v_and_b32_e32 v23, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v16, 11, s3
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v16, 11, s1
; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v35, 1, v5
; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v5, 1, v12
-; GFX12-NEXT: v_lshrrev_b16 v36, 7, s5
-; GFX12-NEXT: v_lshrrev_b16 v37, 6, s5
+; GFX12-NEXT: v_lshrrev_b16 v36, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v37, 6, s3
; GFX12-NEXT: v_and_b32_e32 v56, 1, v8
; GFX12-NEXT: v_and_b32_e32 v4, 1, v10
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v23
@@ -8494,16 +8494,16 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v23, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v24, v1 :: v_dual_and_b32 v25, 0xffff, v0
; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v3
-; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10013
+; GFX12-NEXT: s_bfe_u32 s9, s1, 0x10013
; GFX12-NEXT: v_and_b32_e32 v27, 1, v7
-; GFX12-NEXT: v_lshrrev_b16 v9, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v13, 6, s0
; GFX12-NEXT: v_and_b32_e32 v22, 1, v16
-; GFX12-NEXT: v_lshrrev_b16 v54, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v55, 1, s4
+; GFX12-NEXT: v_lshrrev_b16 v54, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v55, 1, s2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[23:26], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[4:5] offset:96
; GFX12-NEXT: v_and_b32_e32 v23, 1, v37
; GFX12-NEXT: v_and_b32_e32 v25, 0xffff, v36
; GFX12-NEXT: v_dual_mov_b32 v57, v1 :: v_dual_and_b32 v28, 0xffff, v34
@@ -8512,91 +8512,91 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v37, v1 :: v_dual_and_b32 v26, 1, v9
; GFX12-NEXT: v_mov_b32_e32 v27, v1
; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v0, 1, v55
-; GFX12-NEXT: global_store_b128 v1, v[34:37], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v1, v[34:37], s[4:5] offset:64
; GFX12-NEXT: v_and_b32_e32 v34, 1, v13
; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v41
; GFX12-NEXT: v_and_b32_e32 v2, 1, v54
-; GFX12-NEXT: global_store_b128 v1, v[26:29], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[26:29], s[4:5] offset:80
; GFX12-NEXT: v_and_b32_e32 v30, 0xffff, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s7
-; GFX12-NEXT: global_store_b128 v1, v[34:37], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[34:37], s[4:5] offset:48
; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v2
; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_bfe_u32 s7, s3, 0x10016
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10017
-; GFX12-NEXT: v_lshrrev_b16 v20, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 14, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
+; GFX12-NEXT: s_bfe_u32 s7, s1, 0x10016
+; GFX12-NEXT: s_bfe_u32 s8, s1, 0x10017
+; GFX12-NEXT: v_lshrrev_b16 v20, 15, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 14, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:416
; GFX12-NEXT: v_mov_b32_e32 v0, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10012
-; GFX12-NEXT: v_lshrrev_b16 v19, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v32, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v38, 6, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
+; GFX12-NEXT: s_bfe_u32 s8, s1, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v19, 12, s1
+; GFX12-NEXT: v_lshrrev_b16 v32, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v38, 6, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:432
; GFX12-NEXT: v_mov_b32_e32 v0, s8
; GFX12-NEXT: v_mov_b32_e32 v2, s9
-; GFX12-NEXT: v_lshrrev_b16 v39, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v31, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v28, 10, s3
-; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
-; GFX12-NEXT: s_and_b32 s6, s3, 1
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10011
-; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10010
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_lshrrev_b16 v39, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v31, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v28, 10, s1
+; GFX12-NEXT: s_bfe_u32 s3, s1, 0x10018
+; GFX12-NEXT: s_and_b32 s6, s1, 1
+; GFX12-NEXT: s_bfe_u32 s8, s1, 0x10011
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x10010
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:400
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10017
-; GFX12-NEXT: v_lshrrev_b16 v15, 4, s2
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10016
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10017
+; GFX12-NEXT: v_lshrrev_b16 v15, 4, s0
; GFX12-NEXT: v_and_b32_e32 v31, 1, v31
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:384
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10015
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10014
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10015
; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v43
; GFX12-NEXT: v_and_b32_e32 v41, 1, v15
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:176
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v17, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v46, 7, s4
-; GFX12-NEXT: v_lshrrev_b16 v49, 6, s4
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10012
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10013
+; GFX12-NEXT: v_lshrrev_b16 v17, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v46, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v49, 6, s2
; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v43, 0xffff, v42
; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v45, 1, v32
; GFX12-NEXT: v_and_b32_e32 v47, 0xffff, v47
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:160
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 4, s4
-; GFX12-NEXT: v_lshrrev_b16 v12, 2, s4
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: s_and_b32 s7, s2, 1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: v_lshrrev_b16 v16, 4, s2
+; GFX12-NEXT: v_lshrrev_b16 v12, 2, s2
+; GFX12-NEXT: s_bfe_u32 s2, s0, 0x10018
+; GFX12-NEXT: s_and_b32 s7, s0, 1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10011
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x10010
; GFX12-NEXT: v_and_b32_e32 v51, 1, v17
; GFX12-NEXT: v_dual_mov_b32 v54, v1 :: v_dual_and_b32 v53, 0xffff, v52
; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: v_mov_b32_e32 v52, v1
-; GFX12-NEXT: global_store_b128 v1, v[41:44], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[41:44], s[4:5] offset:32
; GFX12-NEXT: v_and_b32_e32 v41, 1, v49
; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v46
; GFX12-NEXT: v_mov_b32_e32 v13, v1
; GFX12-NEXT: v_and_b32_e32 v35, 0xffff, v56
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:128
; GFX12-NEXT: v_mov_b32_e32 v0, s7
; GFX12-NEXT: v_mov_b32_e32 v46, v1
; GFX12-NEXT: v_mov_b32_e32 v2, v37
; GFX12-NEXT: v_dual_mov_b32 v55, v1 :: v_dual_and_b32 v16, 1, v16
; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v48, 1, v19
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX12-NEXT: global_store_b128 v1, v[51:54], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[51:54], s[4:5] offset:16
; GFX12-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_and_b32 v52, 1, v21
; GFX12-NEXT: v_and_b32_e32 v54, 0xffff, v20
; GFX12-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v50, 0xffff, v50
@@ -8608,40 +8608,40 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v56, 1, v28
; GFX12-NEXT: v_and_b32_e32 v58, 0xffff, v22
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v1, v[41:44], s[0:1] offset:496
-; GFX12-NEXT: global_store_b128 v1, v[52:55], s[0:1] offset:368
-; GFX12-NEXT: global_store_b128 v1, v[48:51], s[0:1] offset:352
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v1, v[41:44], s[4:5] offset:496
+; GFX12-NEXT: global_store_b128 v1, v[52:55], s[4:5] offset:368
+; GFX12-NEXT: global_store_b128 v1, v[48:51], s[4:5] offset:352
; GFX12-NEXT: v_mov_b32_e32 v41, v1
; GFX12-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: v_mov_b32_e32 v2, v36
; GFX12-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_and_b32 v33, 0xffff, v33
; GFX12-NEXT: v_mov_b32_e32 v32, v1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[56:59], s[0:1] offset:336
-; GFX12-NEXT: global_store_b128 v1, v[45:48], s[0:1] offset:320
-; GFX12-NEXT: global_store_b128 v1, v[38:41], s[0:1] offset:304
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:256
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: global_store_b128 v1, v[56:59], s[4:5] offset:336
+; GFX12-NEXT: global_store_b128 v1, v[45:48], s[4:5] offset:320
+; GFX12-NEXT: global_store_b128 v1, v[38:41], s[4:5] offset:304
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:256
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, v30
; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v12, 1, v12
; GFX12-NEXT: v_mov_b32_e32 v15, v1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:272
-; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:480
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_store_b128 v1, v[31:34], s[4:5] offset:272
+; GFX12-NEXT: global_store_b128 v1, v[23:26], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v1, v[16:19], s[4:5] offset:480
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:448
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_dual_mov_b32 v2, v35 :: v_dual_mov_b32 v9, v1
; GFX12-NEXT: v_mov_b32_e32 v11, v1
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v28, v1
; GFX12-NEXT: v_mov_b32_e32 v30, v1
; GFX12-NEXT: s_clause 0x4
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:464
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:288
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[4:5] offset:464
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[4:5] offset:288
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:192
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -8977,13 +8977,13 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v64i1_to_v64i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0
; GFX8-NEXT: s_mov_b32 s7, 0
; GFX8-NEXT: s_mov_b32 s13, s7
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v29, s1
-; GFX8-NEXT: v_mov_b32_e32 v28, s0
+; GFX8-NEXT: v_mov_b32_e32 v29, s5
+; GFX8-NEXT: v_mov_b32_e32 v28, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s16, s11, 22
; GFX8-NEXT: s_lshr_b32 s18, s11, 23
@@ -9004,8 +9004,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_mov_b32 s6, s11
; GFX8-NEXT: s_lshr_b32 s12, s11, 24
; GFX8-NEXT: s_lshr_b32 s8, s10, 24
-; GFX8-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[8:9], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x10000
; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX8-NEXT: s_bfe_i64 s[14:15], s[10:11], 0x10000
; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
@@ -9025,91 +9025,91 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX8-NEXT: v_mov_b32_e32 v22, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x1b0
+; GFX8-NEXT: s_add_u32 s16, s4, 0x1b0
; GFX8-NEXT: v_mov_b32_e32 v23, s17
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x1a0
+; GFX8-NEXT: s_add_u32 s16, s4, 0x1a0
; GFX8-NEXT: v_mov_b32_e32 v24, s18
; GFX8-NEXT: v_mov_b32_e32 v25, s19
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x190
+; GFX8-NEXT: s_add_u32 s16, s4, 0x190
; GFX8-NEXT: v_mov_b32_e32 v22, s20
; GFX8-NEXT: v_mov_b32_e32 v23, s21
; GFX8-NEXT: v_mov_b32_e32 v24, s22
; GFX8-NEXT: v_mov_b32_e32 v25, s23
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x180
+; GFX8-NEXT: s_add_u32 s16, s4, 0x180
; GFX8-NEXT: v_mov_b32_e32 v22, s24
; GFX8-NEXT: v_mov_b32_e32 v23, s25
; GFX8-NEXT: v_mov_b32_e32 v24, s26
; GFX8-NEXT: v_mov_b32_e32 v25, s27
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0xb0
+; GFX8-NEXT: s_add_u32 s16, s4, 0xb0
; GFX8-NEXT: v_mov_b32_e32 v22, s28
; GFX8-NEXT: v_mov_b32_e32 v23, s29
; GFX8-NEXT: v_mov_b32_e32 v24, s30
; GFX8-NEXT: v_mov_b32_e32 v25, s31
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0xa0
+; GFX8-NEXT: s_add_u32 s16, s4, 0xa0
; GFX8-NEXT: v_mov_b32_e32 v22, s34
; GFX8-NEXT: v_mov_b32_e32 v23, s35
; GFX8-NEXT: v_mov_b32_e32 v24, s36
; GFX8-NEXT: v_mov_b32_e32 v25, s37
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x90
+; GFX8-NEXT: s_add_u32 s16, s4, 0x90
; GFX8-NEXT: v_mov_b32_e32 v22, s38
; GFX8-NEXT: v_mov_b32_e32 v23, s39
; GFX8-NEXT: v_mov_b32_e32 v24, s40
; GFX8-NEXT: v_mov_b32_e32 v25, s41
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x80
+; GFX8-NEXT: s_add_u32 s16, s4, 0x80
; GFX8-NEXT: v_mov_b32_e32 v22, s42
; GFX8-NEXT: v_mov_b32_e32 v23, s43
; GFX8-NEXT: v_mov_b32_e32 v24, s44
; GFX8-NEXT: v_mov_b32_e32 v25, s45
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x70
+; GFX8-NEXT: s_add_u32 s16, s4, 0x70
; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s10
; GFX8-NEXT: v_lshrrev_b16_e64 v21, 15, s10
; GFX8-NEXT: v_mov_b32_e32 v22, s46
; GFX8-NEXT: v_mov_b32_e32 v23, s47
; GFX8-NEXT: v_mov_b32_e32 v24, s48
; GFX8-NEXT: v_mov_b32_e32 v25, s49
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_bfe_i32 v26, v21, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v20, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v21, s17
; GFX8-NEXT: v_mov_b32_e32 v20, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x60
+; GFX8-NEXT: s_add_u32 s16, s4, 0x60
; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s10
; GFX8-NEXT: v_lshrrev_b16_e64 v19, 13, s10
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[24:27]
; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s10
; GFX8-NEXT: v_bfe_i32 v26, v19, 0, 1
@@ -9119,9 +9119,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v18, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x50
+; GFX8-NEXT: s_add_u32 s16, s4, 0x50
; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[24:27]
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v16, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v16, s16
@@ -9137,7 +9137,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v17, s17
-; GFX8-NEXT: s_add_u32 s10, s0, 64
+; GFX8-NEXT: s_add_u32 s10, s4, 64
; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s11
; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s11
; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s11
@@ -9154,15 +9154,15 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[24:27]
; GFX8-NEXT: v_lshrrev_b16_e64 v17, 3, s11
; GFX8-NEXT: v_lshrrev_b16_e64 v16, 1, s11
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v15, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v14, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v15, s11
; GFX8-NEXT: v_mov_b32_e32 v14, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 48
+; GFX8-NEXT: s_add_u32 s10, s4, 48
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[24:27]
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1
; GFX8-NEXT: v_bfe_i32 v26, v13, 0, 1
@@ -9171,18 +9171,18 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v12, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 32
+; GFX8-NEXT: s_add_u32 s10, s4, 32
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[24:27]
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v10, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v10, s10
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v11, s11
-; GFX8-NEXT: s_add_u32 s10, s0, 16
+; GFX8-NEXT: s_add_u32 s10, s4, 16
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[24:27]
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v9, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v8, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v8, s10
@@ -9190,32 +9190,32 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[24:27]
-; GFX8-NEXT: s_add_u32 s10, s0, 0x170
+; GFX8-NEXT: s_add_u32 s10, s4, 0x170
; GFX8-NEXT: v_bfe_i32 v26, v7, 0, 1
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_mov_b32_e32 v24, s14
; GFX8-NEXT: v_mov_b32_e32 v25, s15
; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v6, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v5, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s10
; GFX8-NEXT: v_mov_b32_e32 v6, s11
-; GFX8-NEXT: s_add_u32 s10, s0, 0x160
+; GFX8-NEXT: s_add_u32 s10, s4, 0x160
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[24:27]
; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v25, s11
; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX8-NEXT: v_mov_b32_e32 v24, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0x150
+; GFX8-NEXT: s_add_u32 s10, s4, 0x150
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[3:6]
; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1
; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s10
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
@@ -9228,39 +9228,39 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_lshrrev_b16_e64 v25, 3, s8
; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
; GFX8-NEXT: v_lshrrev_b16_e64 v6, 1, s8
-; GFX8-NEXT: s_add_u32 s8, s0, 0x140
+; GFX8-NEXT: s_add_u32 s8, s4, 0x140
; GFX8-NEXT: v_bfe_i32 v2, v23, 0, 1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX8-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NEXT: s_add_u32 s8, s0, 0x130
+; GFX8-NEXT: s_add_u32 s8, s4, 0x130
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_bfe_i32 v4, v22, 0, 1
; GFX8-NEXT: v_bfe_i32 v2, v6, 0, 1
; GFX8-NEXT: v_bfe_i32 v6, v21, 0, 1
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: s_add_u32 s8, s0, 0x120
+; GFX8-NEXT: s_add_u32 s8, s4, 0x120
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: v_bfe_i32 v21, v19, 0, 1
; GFX8-NEXT: v_bfe_i32 v19, v20, 0, 1
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: s_add_u32 s8, s0, 0x110
+; GFX8-NEXT: s_add_u32 s8, s4, 0x110
; GFX8-NEXT: v_bfe_i32 v6, v25, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
; GFX8-NEXT: v_bfe_i32 v25, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v18, 0, 1
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_bfe_i32 v4, v24, 0, 1
; GFX8-NEXT: v_bfe_i32 v19, v26, 0, 1
@@ -9270,31 +9270,31 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26]
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s12
; GFX8-NEXT: v_mov_b32_e32 v23, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0x100
+; GFX8-NEXT: s_add_u32 s6, s4, 0x100
; GFX8-NEXT: v_bfe_i32 v25, v16, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v24, s7
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s12
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1f0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x1f0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26]
; GFX8-NEXT: v_bfe_i32 v16, v15, 0, 1
; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s12
; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s12
; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1e0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x1e0
; GFX8-NEXT: v_bfe_i32 v21, v27, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[14:17]
; GFX8-NEXT: v_bfe_i32 v29, v13, 0, 1
; GFX8-NEXT: v_bfe_i32 v27, v12, 0, 1
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s12
; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s12
@@ -9302,12 +9302,12 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v30, 31, v29
; GFX8-NEXT: v_ashrrev_i32_e32 v28, 31, v27
; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1d0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x1d0
; GFX8-NEXT: v_bfe_i32 v23, v9, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[27:30]
; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v9, v10, 0, 1
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9
@@ -9315,41 +9315,41 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s12
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[9:12]
; GFX8-NEXT: v_bfe_i32 v14, v8, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v12, s4
-; GFX8-NEXT: s_add_u32 s4, s0, 0x1c0
-; GFX8-NEXT: v_mov_b32_e32 v13, s5
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: s_add_u32 s2, s4, 0x1c0
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xf0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xe0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xe0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26]
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xd0
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xd0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -9724,115 +9724,115 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v64i1_to_v64i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s19, s5
+; GFX12-NEXT: s_mov_b32 s19, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s26, s3, 22
-; GFX12-NEXT: s_lshr_b32 s28, s3, 23
-; GFX12-NEXT: s_lshr_b32 s30, s3, 20
-; GFX12-NEXT: s_lshr_b32 s34, s3, 21
+; GFX12-NEXT: s_lshr_b32 s26, s1, 22
+; GFX12-NEXT: s_lshr_b32 s28, s1, 23
+; GFX12-NEXT: s_lshr_b32 s30, s1, 20
+; GFX12-NEXT: s_lshr_b32 s34, s1, 21
; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX12-NEXT: s_lshr_b32 s20, s3, 18
+; GFX12-NEXT: s_lshr_b32 s20, s1, 18
; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v43, s27
; GFX12-NEXT: v_dual_mov_b32 v42, s26 :: v_dual_mov_b32 v45, s29
; GFX12-NEXT: v_dual_mov_b32 v44, s28 :: v_dual_mov_b32 v47, s31
-; GFX12-NEXT: s_lshr_b32 s22, s3, 19
+; GFX12-NEXT: s_lshr_b32 s22, s1, 19
; GFX12-NEXT: v_dual_mov_b32 v46, s30 :: v_dual_mov_b32 v49, s35
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v48, s34
-; GFX12-NEXT: s_lshr_b32 s24, s3, 16
-; GFX12-NEXT: s_lshr_b32 s36, s3, 17
+; GFX12-NEXT: s_lshr_b32 s24, s1, 16
+; GFX12-NEXT: s_lshr_b32 s36, s1, 17
; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX12-NEXT: s_lshr_b32 s12, s2, 22
+; GFX12-NEXT: s_lshr_b32 s12, s0, 22
; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:432
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:416
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:432
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:416
; GFX12-NEXT: v_dual_mov_b32 v43, s21 :: v_dual_mov_b32 v42, s20
; GFX12-NEXT: v_dual_mov_b32 v45, s23 :: v_dual_mov_b32 v44, s22
; GFX12-NEXT: v_mov_b32_e32 v47, s25
-; GFX12-NEXT: s_lshr_b32 s14, s2, 23
+; GFX12-NEXT: s_lshr_b32 s14, s0, 23
; GFX12-NEXT: v_dual_mov_b32 v46, s24 :: v_dual_mov_b32 v49, s37
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v48, s36
-; GFX12-NEXT: s_lshr_b32 s16, s2, 20
-; GFX12-NEXT: s_lshr_b32 s40, s2, 21
+; GFX12-NEXT: s_lshr_b32 s16, s0, 20
+; GFX12-NEXT: s_lshr_b32 s40, s0, 21
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX12-NEXT: s_lshr_b32 s6, s2, 18
+; GFX12-NEXT: s_lshr_b32 s6, s0, 18
; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:400
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:384
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:400
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:384
; GFX12-NEXT: v_dual_mov_b32 v43, s13 :: v_dual_mov_b32 v42, s12
; GFX12-NEXT: v_dual_mov_b32 v45, s15 :: v_dual_mov_b32 v44, s14
; GFX12-NEXT: v_mov_b32_e32 v47, s17
-; GFX12-NEXT: s_lshr_b32 s8, s2, 19
+; GFX12-NEXT: s_lshr_b32 s8, s0, 19
; GFX12-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v49, s41
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v48, s40
-; GFX12-NEXT: s_lshr_b32 s10, s2, 16
+; GFX12-NEXT: s_lshr_b32 s10, s0, 16
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX12-NEXT: v_lshrrev_b16 v3, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v3, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 13, s0
; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:160
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:160
; GFX12-NEXT: v_dual_mov_b32 v43, s7 :: v_dual_mov_b32 v42, s6
; GFX12-NEXT: v_dual_mov_b32 v45, s9 :: v_dual_mov_b32 v44, s8
; GFX12-NEXT: v_mov_b32_e32 v47, s11
-; GFX12-NEXT: s_lshr_b32 s42, s2, 17
-; GFX12-NEXT: v_lshrrev_b16 v32, 10, s2
+; GFX12-NEXT: s_lshr_b32 s42, s0, 17
+; GFX12-NEXT: v_lshrrev_b16 v32, 10, s0
; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
-; GFX12-NEXT: v_lshrrev_b16 v34, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v33, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v35, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v27, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v29, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v30, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v31, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v24, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v25, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v23, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v18, 14, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 13, s3
-; GFX12-NEXT: v_lshrrev_b16 v0, 10, s3
-; GFX12-NEXT: v_lshrrev_b16 v1, 11, s3
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v15, 9, s3
-; GFX12-NEXT: v_lshrrev_b16 v14, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v26, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v28, 3, s3
-; GFX12-NEXT: v_lshrrev_b16 v36, 1, s3
-; GFX12-NEXT: s_lshr_b32 s18, s3, 24
-; GFX12-NEXT: s_mov_b32 s4, s3
-; GFX12-NEXT: s_lshr_b32 s38, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v34, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v33, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v35, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v27, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v29, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v30, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v31, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v24, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v25, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v23, 1, s0
+; GFX12-NEXT: v_lshrrev_b16 v18, 14, s1
+; GFX12-NEXT: v_lshrrev_b16 v20, 15, s1
+; GFX12-NEXT: v_lshrrev_b16 v16, 12, s1
+; GFX12-NEXT: v_lshrrev_b16 v19, 13, s1
+; GFX12-NEXT: v_lshrrev_b16 v0, 10, s1
+; GFX12-NEXT: v_lshrrev_b16 v1, 11, s1
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v15, 9, s1
+; GFX12-NEXT: v_lshrrev_b16 v14, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v26, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v28, 3, s1
+; GFX12-NEXT: v_lshrrev_b16 v36, 1, s1
+; GFX12-NEXT: s_lshr_b32 s18, s1, 24
+; GFX12-NEXT: s_mov_b32 s2, s1
+; GFX12-NEXT: s_lshr_b32 s38, s0, 24
; GFX12-NEXT: v_dual_mov_b32 v46, s10 :: v_dual_mov_b32 v49, s43
; GFX12-NEXT: v_bfe_i32 v52, v5, 0, 1
; GFX12-NEXT: v_bfe_i32 v50, v3, 0, 1
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v48, s42
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:144
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:144
; GFX12-NEXT: v_bfe_i32 v44, v9, 0, 1
; GFX12-NEXT: v_bfe_i32 v42, v7, 0, 1
; GFX12-NEXT: v_lshrrev_b16 v41, 2, s18
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:128
; GFX12-NEXT: v_lshrrev_b16 v54, 3, s18
; GFX12-NEXT: v_lshrrev_b16 v56, 6, s38
; GFX12-NEXT: v_ashrrev_i32_e32 v53, 31, v52
@@ -9841,9 +9841,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v43, 31, v42
; GFX12-NEXT: v_bfe_i32 v46, v56, 0, 1
; GFX12-NEXT: v_bfe_i32 v56, v54, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[50:53], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v12, v[50:53], s[4:5] offset:112
; GFX12-NEXT: v_bfe_i32 v34, v34, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:96
; GFX12-NEXT: v_bfe_i32 v32, v32, 0, 1
; GFX12-NEXT: v_bfe_i32 v54, v41, 0, 1
; GFX12-NEXT: v_bfe_i32 v43, v35, 0, 1
@@ -9855,9 +9855,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v42, 31, v41
; GFX12-NEXT: v_lshrrev_b16 v40, 5, s18
; GFX12-NEXT: v_lshrrev_b16 v37, 6, s18
-; GFX12-NEXT: global_store_b128 v12, v[32:35], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v12, v[32:35], s[4:5] offset:80
; GFX12-NEXT: v_bfe_i32 v32, v39, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[41:44], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v12, v[41:44], s[4:5] offset:64
; GFX12-NEXT: v_bfe_i32 v41, v29, 0, 1
; GFX12-NEXT: v_bfe_i32 v39, v27, 0, 1
; GFX12-NEXT: v_bfe_i32 v34, v40, 0, 1
@@ -9869,23 +9869,23 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v62, v37, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v61, 31, v60
; GFX12-NEXT: v_ashrrev_i32_e32 v59, 31, v58
-; GFX12-NEXT: global_store_b128 v12, v[39:42], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v12, v[39:42], s[4:5] offset:48
; GFX12-NEXT: v_bfe_i32 v39, v25, 0, 1
; GFX12-NEXT: v_bfe_i32 v37, v24, 0, 1
; GFX12-NEXT: v_bfe_i32 v64, v38, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[58:61], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v12, v[58:61], s[4:5] offset:32
; GFX12-NEXT: v_bfe_i32 v43, v23, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v40, 31, v39
; GFX12-NEXT: v_ashrrev_i32_e32 v38, 31, v37
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GFX12-NEXT: v_bfe_i32 v24, v36, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v44, 31, v43
-; GFX12-NEXT: v_dual_mov_b32 v41, s2 :: v_dual_mov_b32 v42, s3
-; GFX12-NEXT: v_mov_b32_e32 v23, s5
-; GFX12-NEXT: global_store_b128 v12, v[37:40], s[0:1] offset:16
+; GFX12-NEXT: v_dual_mov_b32 v41, s0 :: v_dual_mov_b32 v42, s1
+; GFX12-NEXT: v_mov_b32_e32 v23, s3
+; GFX12-NEXT: global_store_b128 v12, v[37:40], s[4:5] offset:16
; GFX12-NEXT: v_bfe_i32 v38, v20, 0, 1
; GFX12-NEXT: v_bfe_i32 v36, v18, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[41:44], s[0:1]
+; GFX12-NEXT: global_store_b128 v12, v[41:44], s[4:5]
; GFX12-NEXT: v_bfe_i32 v20, v19, 0, 1
; GFX12-NEXT: v_bfe_i32 v18, v16, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v39, 31, v38
@@ -9901,8 +9901,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v11, 4, s38
; GFX12-NEXT: v_lshrrev_b16 v2, 1, s38
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v12, v[36:39], s[0:1] offset:368
-; GFX12-NEXT: global_store_b128 v12, v[18:21], s[0:1] offset:352
+; GFX12-NEXT: global_store_b128 v12, v[36:39], s[4:5] offset:368
+; GFX12-NEXT: global_store_b128 v12, v[18:21], s[4:5] offset:352
; GFX12-NEXT: v_bfe_i32 v38, v1, 0, 1
; GFX12-NEXT: v_bfe_i32 v36, v0, 0, 1
; GFX12-NEXT: v_bfe_i32 v52, v55, 0, 1
@@ -9932,7 +9932,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v33, 31, v32
; GFX12-NEXT: v_ashrrev_i32_e32 v21, 31, v20
; GFX12-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; GFX12-NEXT: v_dual_mov_b32 v22, s4 :: v_dual_mov_b32 v51, s9
+; GFX12-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v51, s9
; GFX12-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_ashrrev_i32_e32 v49, 31, v48
; GFX12-NEXT: v_ashrrev_i32_e32 v47, 31, v46
@@ -9949,22 +9949,22 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v12, v[36:39], s[0:1] offset:336
-; GFX12-NEXT: global_store_b128 v12, v[18:21], s[0:1] offset:320
-; GFX12-NEXT: global_store_b128 v12, v[14:17], s[0:1] offset:304
-; GFX12-NEXT: global_store_b128 v12, v[40:43], s[0:1] offset:288
-; GFX12-NEXT: global_store_b128 v12, v[26:29], s[0:1] offset:272
-; GFX12-NEXT: global_store_b128 v12, v[22:25], s[0:1] offset:256
-; GFX12-NEXT: global_store_b128 v12, v[62:65], s[0:1] offset:496
-; GFX12-NEXT: global_store_b128 v12, v[32:35], s[0:1] offset:480
+; GFX12-NEXT: global_store_b128 v12, v[36:39], s[4:5] offset:336
+; GFX12-NEXT: global_store_b128 v12, v[18:21], s[4:5] offset:320
+; GFX12-NEXT: global_store_b128 v12, v[14:17], s[4:5] offset:304
+; GFX12-NEXT: global_store_b128 v12, v[40:43], s[4:5] offset:288
+; GFX12-NEXT: global_store_b128 v12, v[26:29], s[4:5] offset:272
+; GFX12-NEXT: global_store_b128 v12, v[22:25], s[4:5] offset:256
+; GFX12-NEXT: global_store_b128 v12, v[62:65], s[4:5] offset:496
+; GFX12-NEXT: global_store_b128 v12, v[32:35], s[4:5] offset:480
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v12, v[54:57], s[0:1] offset:464
-; GFX12-NEXT: global_store_b128 v12, v[50:53], s[0:1] offset:448
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v12, v[8:11], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v12, v[4:7], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v12, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v12, v[54:57], s[4:5] offset:464
+; GFX12-NEXT: global_store_b128 v12, v[50:53], s[4:5] offset:448
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v12, v[4:7], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:192
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index a87fa8b..a5ca228 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -38,13 +38,13 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: constant_load_i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_short v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -77,12 +77,12 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
;
; GFX12-LABEL: constant_load_i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -119,13 +119,13 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
;
; GCN-NOHSA-VI-LABEL: constant_load_v2i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -147,12 +147,12 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v2i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -198,18 +198,18 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
;
; GCN-NOHSA-VI-LABEL: constant_load_v3i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s0, 4
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s4, 4
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s0
; GCN-NOHSA-VI-NEXT: flat_store_short v[2:3], v4
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v5
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -252,15 +252,15 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v3i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] offset:4
-; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5] offset:4
+; GFX12-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -299,14 +299,14 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
;
; GCN-NOHSA-VI-LABEL: constant_load_v4i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -328,13 +328,13 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v4i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -377,16 +377,16 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
;
; GCN-NOHSA-VI-LABEL: constant_load_v8i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -408,14 +408,14 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v8i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -608,41 +608,41 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
;
; GCN-NOHSA-VI-LABEL: constant_load_v16i16_align2:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 14
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 12
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 10
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 8
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 6
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 4
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 30
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 28
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 26
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 14
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 12
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 10
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 8
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 6
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 4
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 30
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 28
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 26
; GCN-NOHSA-VI-NEXT: flat_load_ushort v16, v[0:1]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v17, v[2:3]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v18, v[4:5]
@@ -651,35 +651,35 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-NOHSA-VI-NEXT: flat_load_ushort v21, v[10:11]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v22, v[12:13]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v23, v[14:15]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 24
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 22
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 20
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 18
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 2
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 24
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 22
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 20
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 18
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 16
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 2
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s2
; GCN-NOHSA-VI-NEXT: flat_load_ushort v0, v[0:1]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v24, v[2:3]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v4, v[4:5]
@@ -742,26 +742,26 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
;
; GFX12-LABEL: constant_load_v16i16_align2:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0xf
-; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28
-; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24
-; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:20
-; GFX12-NEXT: global_load_u16 v0, v8, s[0:1] offset:16
-; GFX12-NEXT: global_load_u16 v7, v8, s[0:1] offset:12
-; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:8
-; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:4
-; GFX12-NEXT: global_load_u16 v4, v8, s[0:1]
-; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30
-; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26
-; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22
-; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18
-; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14
-; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
-; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
-; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
+; GFX12-NEXT: global_load_u16 v3, v8, s[2:3] offset:28
+; GFX12-NEXT: global_load_u16 v2, v8, s[2:3] offset:24
+; GFX12-NEXT: global_load_u16 v1, v8, s[2:3] offset:20
+; GFX12-NEXT: global_load_u16 v0, v8, s[2:3] offset:16
+; GFX12-NEXT: global_load_u16 v7, v8, s[2:3] offset:12
+; GFX12-NEXT: global_load_u16 v6, v8, s[2:3] offset:8
+; GFX12-NEXT: global_load_u16 v5, v8, s[2:3] offset:4
+; GFX12-NEXT: global_load_u16 v4, v8, s[2:3]
+; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[2:3] offset:30
+; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[2:3] offset:26
+; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[2:3] offset:22
+; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[2:3] offset:18
+; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[2:3] offset:14
+; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[2:3] offset:10
+; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[2:3] offset:6
+; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[2:3] offset:2
; GFX12-NEXT: s_wait_loadcnt 0x4
; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -808,13 +808,13 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
;
; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -837,12 +837,12 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_zextload_i16_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u16 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -885,13 +885,13 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
;
; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_sshort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -915,12 +915,12 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_sextload_i16_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_i16 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -963,13 +963,13 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -992,12 +992,12 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v1i16_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u16 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1040,13 +1040,13 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_sshort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -1070,12 +1070,12 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v1i16_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_i16 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1118,16 +1118,16 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1152,16 +1152,16 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v2i16_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s3, s2, 0xffff
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_and_b32 s1, s0, 0xffff
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1205,16 +1205,16 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s2, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1240,16 +1240,16 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v2i16_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_sext_i32_i16 s3, s2
-; GFX12-NEXT: s_ashr_i32 s2, s2, 16
+; GFX12-NEXT: s_sext_i32_i16 s1, s0
+; GFX12-NEXT: s_ashr_i32 s0, s0, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1298,18 +1298,18 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s3, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1338,16 +1338,16 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v3i16_to_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX12-NEXT: s_and_b32 s4, s2, 0xffff
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1397,17 +1397,17 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s2, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -1440,16 +1440,16 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v3i16_to_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s4, s2, 16
-; GFX12-NEXT: s_sext_i32_i16 s2, s2
-; GFX12-NEXT: s_sext_i32_i16 s3, s3
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s3
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_ashr_i32 s2, s0, 16
+; GFX12-NEXT: s_sext_i32_i16 s0, s0
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s1
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1503,20 +1503,20 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s3, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s3, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s1, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1545,19 +1545,19 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v4i16_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s4, s3, 16
-; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX12-NEXT: s_and_b32 s5, s2, 0xffff
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_lshr_b32 s2, s1, 16
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_and_b32 s3, s0, 0xffff
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1611,20 +1611,20 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s3, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s2, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s1, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1655,18 +1655,18 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v4i16_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s4, s3, 16
-; GFX12-NEXT: s_ashr_i32 s5, s2, 16
-; GFX12-NEXT: s_sext_i32_i16 s2, s2
-; GFX12-NEXT: s_sext_i32_i16 s3, s3
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_ashr_i32 s2, s1, 16
+; GFX12-NEXT: s_ashr_i32 s3, s0, 16
+; GFX12-NEXT: s_sext_i32_i16 s0, s0
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1743,34 +1743,34 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s7, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s1, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s3, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s3, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s2, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v8i16_to_v8i32:
@@ -1807,26 +1807,26 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v8i16_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s8, s7, 16
-; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX12-NEXT: s_and_b32 s9, s6, 0xffff
-; GFX12-NEXT: s_lshr_b32 s6, s6, 16
-; GFX12-NEXT: s_lshr_b32 s2, s5, 16
-; GFX12-NEXT: s_and_b32 s3, s5, 0xffff
-; GFX12-NEXT: s_lshr_b32 s5, s4, 16
-; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-NEXT: s_lshr_b32 s8, s3, 16
+; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX12-NEXT: s_and_b32 s9, s2, 0xffff
+; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_lshr_b32 s6, s1, 16
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_lshr_b32 s7, s0, 16
+; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v3, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX12-NEXT: v_mov_b32_e32 v6, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s6
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1903,34 +1903,34 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s5, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s4, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s7, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s6, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s1, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s3, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s2, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_sextload_v8i16_to_v8i32:
@@ -1969,26 +1969,26 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v8i16_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s8, s7, 16
-; GFX12-NEXT: s_ashr_i32 s9, s6, 16
-; GFX12-NEXT: s_sext_i32_i16 s6, s6
-; GFX12-NEXT: s_sext_i32_i16 s7, s7
-; GFX12-NEXT: s_ashr_i32 s2, s5, 16
-; GFX12-NEXT: s_ashr_i32 s3, s4, 16
-; GFX12-NEXT: s_sext_i32_i16 s5, s5
-; GFX12-NEXT: s_sext_i32_i16 s4, s4
+; GFX12-NEXT: s_ashr_i32 s8, s3, 16
+; GFX12-NEXT: s_ashr_i32 s9, s2, 16
+; GFX12-NEXT: s_sext_i32_i16 s2, s2
+; GFX12-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-NEXT: s_ashr_i32 s6, s1, 16
+; GFX12-NEXT: s_ashr_i32 s7, s0, 16
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: s_sext_i32_i16 s0, s0
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s3
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX12-NEXT: v_mov_b32_e32 v6, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s6
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2108,60 +2108,60 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s7, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s6, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s9, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s8, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s11, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s11, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s9, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s9, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s8, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s8, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s11, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s11, 0xffff
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s10, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s13, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s12, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s15, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s15, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s14, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i16_to_v16i32:
@@ -2219,40 +2219,40 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; GFX12-LABEL: constant_zextload_v16i16_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s16, s11, 16
-; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
-; GFX12-NEXT: s_and_b32 s17, s10, 0xffff
-; GFX12-NEXT: s_lshr_b32 s10, s10, 16
-; GFX12-NEXT: s_lshr_b32 s14, s9, 16
-; GFX12-NEXT: s_and_b32 s9, s9, 0xffff
-; GFX12-NEXT: s_lshr_b32 s15, s8, 16
-; GFX12-NEXT: s_and_b32 s8, s8, 0xffff
-; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s10
-; GFX12-NEXT: s_lshr_b32 s12, s7, 16
-; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX12-NEXT: s_lshr_b32 s13, s6, 16
-; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX12-NEXT: s_lshr_b32 s16, s15, 16
+; GFX12-NEXT: s_and_b32 s15, s15, 0xffff
+; GFX12-NEXT: s_and_b32 s17, s14, 0xffff
+; GFX12-NEXT: s_lshr_b32 s14, s14, 16
+; GFX12-NEXT: s_lshr_b32 s0, s9, 16
+; GFX12-NEXT: s_and_b32 s1, s9, 0xffff
+; GFX12-NEXT: s_lshr_b32 s2, s8, 16
+; GFX12-NEXT: s_and_b32 s3, s8, 0xffff
+; GFX12-NEXT: s_lshr_b32 s6, s11, 16
+; GFX12-NEXT: s_and_b32 s7, s11, 0xffff
+; GFX12-NEXT: s_lshr_b32 s8, s10, 16
+; GFX12-NEXT: s_and_b32 s9, s10, 0xffff
+; GFX12-NEXT: s_lshr_b32 s10, s13, 16
+; GFX12-NEXT: s_and_b32 s11, s13, 0xffff
+; GFX12-NEXT: s_lshr_b32 s13, s12, 16
+; GFX12-NEXT: s_and_b32 s12, s12, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s14
; GFX12-NEXT: v_dual_mov_b32 v0, s17 :: v_dual_mov_b32 v3, s16
-; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15
-; GFX12-NEXT: s_lshr_b32 s2, s5, 16
-; GFX12-NEXT: s_and_b32 s3, s5, 0xffff
-; GFX12-NEXT: s_lshr_b32 s5, s4, 16
-; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14
-; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13
-; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12
-; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s13
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s10
+; GFX12-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s8
+; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6
+; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s2
+; GFX12-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2372,60 +2372,60 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s12, s5, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s13, s4, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s14, s7, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s6, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s16, s9, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s17, s8, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s11, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s10, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s9, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s8, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s9
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s8
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s11, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s10, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s16, s13, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s17, s12, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s15, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s14, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_sextload_v16i16_to_v16i32:
@@ -2487,40 +2487,40 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; GFX12-LABEL: constant_sextload_v16i16_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s16, s11, 16
-; GFX12-NEXT: s_ashr_i32 s17, s10, 16
-; GFX12-NEXT: s_sext_i32_i16 s10, s10
-; GFX12-NEXT: s_sext_i32_i16 s11, s11
-; GFX12-NEXT: s_ashr_i32 s14, s9, 16
-; GFX12-NEXT: s_ashr_i32 s15, s8, 16
-; GFX12-NEXT: s_sext_i32_i16 s9, s9
-; GFX12-NEXT: s_sext_i32_i16 s8, s8
+; GFX12-NEXT: s_ashr_i32 s16, s15, 16
+; GFX12-NEXT: s_ashr_i32 s17, s14, 16
+; GFX12-NEXT: s_sext_i32_i16 s14, s14
+; GFX12-NEXT: s_sext_i32_i16 s15, s15
+; GFX12-NEXT: s_ashr_i32 s0, s9, 16
+; GFX12-NEXT: s_ashr_i32 s1, s8, 16
+; GFX12-NEXT: s_sext_i32_i16 s2, s9
+; GFX12-NEXT: s_sext_i32_i16 s3, s8
+; GFX12-NEXT: s_ashr_i32 s6, s11, 16
+; GFX12-NEXT: s_ashr_i32 s7, s10, 16
+; GFX12-NEXT: s_sext_i32_i16 s8, s11
+; GFX12-NEXT: s_sext_i32_i16 s9, s10
+; GFX12-NEXT: s_ashr_i32 s10, s13, 16
+; GFX12-NEXT: s_ashr_i32 s11, s12, 16
+; GFX12-NEXT: s_sext_i32_i16 s13, s13
+; GFX12-NEXT: s_sext_i32_i16 s12, s12
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
-; GFX12-NEXT: s_ashr_i32 s12, s7, 16
-; GFX12-NEXT: s_ashr_i32 s13, s6, 16
-; GFX12-NEXT: s_sext_i32_i16 s7, s7
-; GFX12-NEXT: s_sext_i32_i16 s6, s6
-; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s16
-; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15
-; GFX12-NEXT: s_ashr_i32 s2, s5, 16
-; GFX12-NEXT: s_ashr_i32 s3, s4, 16
-; GFX12-NEXT: s_sext_i32_i16 s5, s5
-; GFX12-NEXT: s_sext_i32_i16 s4, s4
-; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14
-; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13
-; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12
-; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s3
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s16
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s11
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s10
+; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s7
+; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6
+; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s1
+; GFX12-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s2
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5397,14 +5397,14 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
;
; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -5429,13 +5429,13 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_zextload_i16_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5485,13 +5485,13 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
;
; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -5519,15 +5519,15 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_sextload_i16_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5572,14 +5572,14 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -5604,13 +5604,13 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v1i16_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5655,13 +5655,13 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -5689,15 +5689,15 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v1i16_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5744,18 +5744,18 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -5782,17 +5782,17 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v2i16_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s2
+; GFX12-NEXT: s_and_b32 s1, 0xffff, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
+; GFX12-NEXT: s_pack_hl_b32_b16 s0, s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5841,19 +5841,19 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -5882,17 +5882,17 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v2i16_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s4, s2, 16
+; GFX12-NEXT: s_lshr_b32 s2, s0, 16
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5954,28 +5954,28 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s1, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v4i16_to_v4i64:
@@ -6009,22 +6009,22 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v4i16_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s4, 0xffff, s2
+; GFX12-NEXT: s_and_b32 s2, 0xffff, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_pack_hl_b32_b16 s0, s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s3, 0
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_pack_hl_b32_b16 s0, s1, 0
+; GFX12-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6094,32 +6094,32 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s3
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s3, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s1, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16
+; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s4, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s5, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -6156,25 +6156,25 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v4i16_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: s_lshr_b32 s8, s3, 16
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_mov_b32 s6, s1
+; GFX12-NEXT: s_lshr_b32 s8, s1, 16
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x100000
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7
; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s9
-; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6266,46 +6266,46 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s6, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s7, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s1, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s2, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s3, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s3, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 32
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v8i16_to_v8i64:
@@ -6357,31 +6357,32 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v8i16_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s2, 0xffff, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: s_pack_hl_b32_b16 s3, s7, 0
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s6, 0
+; GFX12-NEXT: s_and_b32 s6, 0xffff, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_pack_hl_b32_b16 s3, s3, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s6
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s5, 0
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s4, 0
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: s_pack_hl_b32_b16 s3, s2, 0
+; GFX12-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: s_pack_hl_b32_b16 s2, s1, 0
+; GFX12-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_pack_hl_b32_b16 s1, s0, 0
+; GFX12-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6492,57 +6493,57 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s5
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s1, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s4, 32
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 32
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s4, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -6598,38 +6599,38 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v8i16_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s14, s7
-; GFX12-NEXT: s_lshr_b32 s16, s7, 16
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[6:7], 0x100000
-; GFX12-NEXT: s_lshr_b32 s6, s6, 16
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000
-; GFX12-NEXT: s_mov_b32 s8, s5
-; GFX12-NEXT: s_lshr_b32 s10, s5, 16
+; GFX12-NEXT: s_mov_b32 s14, s3
+; GFX12-NEXT: s_lshr_b32 s16, s3, 16
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x100000
+; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[0:1], 0x100000
+; GFX12-NEXT: s_mov_b32 s8, s1
+; GFX12-NEXT: s_lshr_b32 s10, s1, 16
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
-; GFX12-NEXT: s_lshr_b32 s4, s4, 16
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v9, s15
+; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v9, s15
; GFX12-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v11, s17
-; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v5, s3
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v13, s9
+; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v13, s9
; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s11
-; GFX12-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v7, s5
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6781,82 +6782,82 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s6, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s10, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s11, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s8, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s8, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s9, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s9, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s10, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s10, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s11, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s9, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s9, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x50
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 64
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s14, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s15, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s12, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s13, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s13, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 0x50
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 64
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x70
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 0x70
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 0x60
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i16_to_v16i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index b0d8f72..5692d1d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -34,13 +34,13 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-NOHSA-LABEL: constant_load_i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -73,12 +73,12 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
;
; GFX12-LABEL: constant_load_i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -117,14 +117,14 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-NOHSA-LABEL: constant_load_v2i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -158,13 +158,13 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v2i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -207,15 +207,15 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-NOHSA-LABEL: constant_load_v3i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -255,13 +255,13 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0
+; GFX12-NEXT: s_load_b96 s[0:2], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -304,16 +304,16 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-NOHSA-LABEL: constant_load_v4i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -349,14 +349,14 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v4i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -893,33 +893,33 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-NOHSA-LABEL: constant_load_v11i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20
-; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20
+; GFX8-NOHSA-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s4, 16
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s4
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[0:1], v[4:6]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -1421,14 +1421,14 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
;
; GFX8-NOHSA-LABEL: constant_zextload_i32_to_i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1462,12 +1462,12 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_zextload_i32_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1508,15 +1508,15 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
;
; GFX8-NOHSA-LABEL: constant_sextload_i32_to_i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s2, 31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1552,15 +1552,15 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_sextload_i32_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_ashr_i32 s1, s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1599,14 +1599,14 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_zextload_v1i32_to_v1i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1640,12 +1640,12 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v1i32_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1686,15 +1686,15 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_sextload_v1i32_to_v1i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s2, 31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1730,15 +1730,15 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v1i32_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_ashr_i32 s1, s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1781,16 +1781,16 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_zextload_v2i32_to_v2i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1829,14 +1829,14 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v2i32_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1884,19 +1884,18 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_sextload_v2i32_to_v2i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s3, 31
-; GFX8-NOHSA-NEXT: s_mov_b32 s1, s3
-; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s2, 31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s1, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s0, 31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1940,17 +1939,17 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v2i32_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s4, s3, 31
-; GFX12-NEXT: s_ashr_i32 s5, s2, 31
+; GFX12-NEXT: s_ashr_i32 s2, s1, 31
+; GFX12-NEXT: s_ashr_i32 s3, s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2004,23 +2003,23 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_zextload_v4i32_to_v4i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s4, 16
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -2070,17 +2069,17 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v4i32_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2145,29 +2144,29 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_sextload_v4i32_to_v4i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s8, s5, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s9, s4, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s7, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s6, 31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s1, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s7, s0, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s8, s3, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s9, s2, 31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -2226,22 +2225,22 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v4i32_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s8, s7, 31
-; GFX12-NEXT: s_ashr_i32 s9, s6, 31
-; GFX12-NEXT: s_ashr_i32 s2, s5, 31
-; GFX12-NEXT: s_ashr_i32 s3, s4, 31
+; GFX12-NEXT: s_ashr_i32 s8, s3, 31
+; GFX12-NEXT: s_ashr_i32 s9, s2, 31
+; GFX12-NEXT: s_ashr_i32 s6, s1, 31
+; GFX12-NEXT: s_ashr_i32 s7, s0, 31
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s3
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX12-NEXT: v_mov_b32_e32 v6, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s6
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2683,32 +2682,32 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v8i32_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s16, s11, 31
-; GFX12-NEXT: s_ashr_i32 s17, s10, 31
-; GFX12-NEXT: s_ashr_i32 s14, s9, 31
-; GFX12-NEXT: s_ashr_i32 s15, s8, 31
+; GFX12-NEXT: s_ashr_i32 s16, s15, 31
+; GFX12-NEXT: s_ashr_i32 s17, s14, 31
+; GFX12-NEXT: s_ashr_i32 s6, s13, 31
+; GFX12-NEXT: s_ashr_i32 s7, s12, 31
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
-; GFX12-NEXT: s_ashr_i32 s12, s7, 31
-; GFX12-NEXT: s_ashr_i32 s13, s6, 31
-; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s16
-; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15
-; GFX12-NEXT: s_ashr_i32 s2, s5, 31
-; GFX12-NEXT: s_ashr_i32 s3, s4, 31
-; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14
-; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13
-; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12
-; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s3
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s5
+; GFX12-NEXT: s_ashr_i32 s2, s11, 31
+; GFX12-NEXT: s_ashr_i32 s3, s10, 31
+; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s16
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: s_ashr_i32 s0, s9, 31
+; GFX12-NEXT: s_ashr_i32 s1, s8, 31
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s6
+; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s3
+; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s2
+; GFX12-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v13, s1
+; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s9
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 66c73fda..9432584 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -34,14 +34,14 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-LABEL: constant_load_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
@@ -63,13 +63,13 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
;
; GFX12-LABEL: constant_load_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -111,16 +111,16 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v2i64:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -142,14 +142,14 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v2i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -205,24 +205,24 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v3i64:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x10
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x10
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT: s_add_u32 s6, s4, 16
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s8
; GFX8-NEXT: v_mov_b32_e32 v6, s9
; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[5:6]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -253,19 +253,19 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v3i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[8:9], s[2:3], 0x10
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[8:9], s[6:7], 0x10
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s9
-; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v6, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 889755c..29ca6c6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -39,13 +39,13 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-NOHSA-LABEL: constant_load_i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_byte v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -78,12 +78,12 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
;
; GFX12-LABEL: constant_load_i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -126,13 +126,13 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-NOHSA-LABEL: constant_load_v2i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -165,12 +165,12 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v2i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -217,14 +217,14 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-NOHSA-LABEL: constant_load_v3i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 2
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 2
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
@@ -278,14 +278,14 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v3i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[0:1] offset:2
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[4:5] offset:2
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -322,13 +322,13 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-NOHSA-LABEL: constant_load_v4i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -350,12 +350,12 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v4i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -394,14 +394,14 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-NOHSA-LABEL: constant_load_v8i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -423,13 +423,13 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v8i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -472,16 +472,16 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-NOHSA-LABEL: constant_load_v16i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -503,14 +503,14 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v16i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -553,13 +553,13 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -582,12 +582,12 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i8_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -630,13 +630,13 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -660,12 +660,12 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i8_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_i8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -708,13 +708,13 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -737,12 +737,12 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i8_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -785,13 +785,13 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -815,12 +815,12 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i8_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_i8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -868,13 +868,13 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v3, 8, v2
; GFX8-NOHSA-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -911,16 +911,16 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i8_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0
; GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -968,13 +968,13 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v3, 8, v2
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -1011,16 +1011,16 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i8_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1070,17 +1070,17 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v3i8_to_v3i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
-; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s2, 0x80010
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1112,17 +1112,17 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v3i8_to_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_and_b32 s3, s2, 0xff
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_and_b32 s1, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1173,18 +1173,18 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v3i8_to_v3i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s2, 0x80010
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s1, s2
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0
; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1216,18 +1216,18 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v3i8_to_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_sext_i32_i8 s3, s2
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_sext_i32_i8 s1, s0
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1280,19 +1280,19 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1322,19 +1322,19 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_lshr_b32 s3, s2, 24
-; GFX12-NEXT: s_and_b32 s4, s2, 0xff
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_lshr_b32 s1, s0, 24
+; GFX12-NEXT: s_and_b32 s2, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1386,20 +1386,20 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s2, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s2, 0x80010
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s2, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0
; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1431,19 +1431,19 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_ashr_i32 s3, s2, 24
-; GFX12-NEXT: s_sext_i32_i8 s4, s2
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_ashr_i32 s1, s0, 24
+; GFX12-NEXT: s_sext_i32_i8 s2, s0
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1518,30 +1518,30 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s2, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s3, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s3, 0x80010
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s2
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s2, 0xff
-; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s1, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s6, s1, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s1
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s7, s0, 0xff
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -1583,26 +1583,26 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: s_and_b32 s7, s2, 0xff
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
-; GFX12-NEXT: s_and_b32 s6, s3, 0xff
-; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: s_and_b32 s7, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX12-NEXT: s_lshr_b32 s2, s1, 24
+; GFX12-NEXT: s_and_b32 s6, s1, 0xff
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT: v_mov_b32_e32 v6, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1677,32 +1677,32 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s4, s3, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s5, s3, 0x80010
-; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s2, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s7, s2, 0x80010
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 16
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s7, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s1
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -1747,28 +1747,28 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT: s_ashr_i32 s6, s2, 24
-; GFX12-NEXT: s_sext_i32_i8 s7, s2
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
-; GFX12-NEXT: s_ashr_i32 s4, s3, 24
-; GFX12-NEXT: s_bfe_i32 s5, s3, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s3, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: s_ashr_i32 s6, s0, 24
+; GFX12-NEXT: s_sext_i32_i8 s7, s0
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x80010
+; GFX12-NEXT: s_ashr_i32 s2, s1, 24
+; GFX12-NEXT: s_bfe_i32 s3, s1, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s1, s1
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s6
-; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v4, s3
-; GFX12-NEXT: v_mov_b32_e32 v6, s5
+; GFX12-NEXT: v_mov_b32_e32 v4, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s3
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1889,55 +1889,55 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s4, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s5, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s4
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s12, s5, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s5
-; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s13, s6, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s6
-; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s7, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s7
-; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s7
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s0, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s1, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s3, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s10, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s12, s1, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1
+; GFX8-NOHSA-NEXT: s_bfe_u32 s13, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s14, s2, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s2
+; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s9
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s10
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1999,40 +1999,40 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5
-; GFX12-NEXT: s_lshr_b32 s8, s6, 24
-; GFX12-NEXT: s_lshr_b32 s9, s7, 24
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT: s_and_b32 s12, s6, 0xff
-; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010
-; GFX12-NEXT: s_and_b32 s13, s7, 0xff
-; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
-; GFX12-NEXT: s_and_b32 s11, s5, 0xff
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s1
+; GFX12-NEXT: s_lshr_b32 s8, s2, 24
+; GFX12-NEXT: s_lshr_b32 s9, s3, 24
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s0
+; GFX12-NEXT: s_and_b32 s12, s2, 0xff
+; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: s_and_b32 s13, s3, 0xff
+; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010
+; GFX12-NEXT: s_and_b32 s11, s1, 0xff
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s9
-; GFX12-NEXT: s_lshr_b32 s3, s5, 24
-; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
+; GFX12-NEXT: s_lshr_b32 s7, s1, 24
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s3
-; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX12-NEXT: s_lshr_b32 s2, s4, 24
-; GFX12-NEXT: s_and_b32 s10, s4, 0xff
-; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v11, s7
+; GFX12-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: s_lshr_b32 s6, s0, 24
+; GFX12-NEXT: s_and_b32 s10, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s6
; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_and_b32 v9, 0xffff, v9
+; GFX12-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_and_b32 v9, 0xffff, v9
; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_and_b32 v13, 0xffff, v13
-; GFX12-NEXT: v_mov_b32_e32 v14, s4
+; GFX12-NEXT: v_mov_b32_e32 v14, s0
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2153,59 +2153,59 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s8, s4, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s9, s4, 0x80010
-; GFX8-NOHSA-NEXT: s_ashr_i32 s10, s5, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s11, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_ashr_i32 s12, s6, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s13, s6, 0x80010
-; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s7, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s7, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s7
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s7, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s9, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s10, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s1
+; GFX8-NOHSA-NEXT: s_ashr_i32 s12, s2, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s13, s2, 0x80010
+; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s3, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s3
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
; GFX8-NOHSA-NEXT: v_bfe_i32 v7, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s7
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
; GFX8-NOHSA-NEXT: v_bfe_i32 v5, v2, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s5
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s4
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -2275,44 +2275,44 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT: s_ashr_i32 s12, s7, 24
-; GFX12-NEXT: s_sext_i32_i8 s13, s7
-; GFX12-NEXT: s_bfe_i32 s7, s7, 0x80010
-; GFX12-NEXT: s_ashr_i32 s10, s6, 24
-; GFX12-NEXT: s_bfe_i32 s11, s6, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s6, s6
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s0
+; GFX12-NEXT: s_ashr_i32 s12, s3, 24
+; GFX12-NEXT: s_sext_i32_i8 s13, s3
+; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80010
+; GFX12-NEXT: s_ashr_i32 s10, s2, 24
+; GFX12-NEXT: s_bfe_i32 s11, s2, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s2, s2
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s12
-; GFX12-NEXT: s_ashr_i32 s8, s5, 24
-; GFX12-NEXT: s_bfe_i32 s9, s5, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s5, s5
+; GFX12-NEXT: s_ashr_i32 s8, s1, 24
+; GFX12-NEXT: s_bfe_i32 s9, s1, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s1, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s10
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v11, s8
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: s_ashr_i32 s2, s4, 24
-; GFX12-NEXT: s_bfe_i32 s3, s4, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s4, s4
-; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: s_ashr_i32 s6, s0, 24
+; GFX12-NEXT: s_bfe_i32 s7, s0, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s0, s0
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v15, s6
; GFX12-NEXT: v_mov_b32_e32 v6, s11
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v8, s5
+; GFX12-NEXT: v_mov_b32_e32 v8, s1
; GFX12-NEXT: v_mov_b32_e32 v10, s9
; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v12, s4
-; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: v_mov_b32_e32 v12, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s7
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2724,71 +2724,71 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v32i8_to_v32i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9
-; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7
-; GFX12-NEXT: s_lshr_b32 s15, s9, 24
-; GFX12-NEXT: s_lshr_b32 s17, s11, 24
-; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6
-; GFX12-NEXT: s_and_b32 s23, s9, 0xff
-; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010
-; GFX12-NEXT: s_and_b32 s25, s11, 0xff
-; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010
-; GFX12-NEXT: s_lshr_b32 s14, s8, 24
-; GFX12-NEXT: s_lshr_b32 s16, s10, 24
-; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5
-; GFX12-NEXT: s_and_b32 s22, s8, 0xff
-; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010
-; GFX12-NEXT: s_and_b32 s24, s10, 0xff
-; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s15
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s14
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s13
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s12
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s11
+; GFX12-NEXT: s_lshr_b32 s7, s13, 24
+; GFX12-NEXT: s_lshr_b32 s17, s15, 24
+; GFX12-NEXT: v_lshrrev_b16 v12, 8, s10
+; GFX12-NEXT: s_and_b32 s23, s13, 0xff
+; GFX12-NEXT: s_bfe_u32 s13, s13, 0x80010
+; GFX12-NEXT: s_and_b32 s25, s15, 0xff
+; GFX12-NEXT: s_bfe_u32 s15, s15, 0x80010
+; GFX12-NEXT: s_lshr_b32 s6, s12, 24
+; GFX12-NEXT: s_lshr_b32 s16, s14, 24
+; GFX12-NEXT: v_lshrrev_b16 v14, 8, s9
+; GFX12-NEXT: s_and_b32 s22, s12, 0xff
+; GFX12-NEXT: s_bfe_u32 s12, s12, 0x80010
+; GFX12-NEXT: s_and_b32 s24, s14, 0xff
+; GFX12-NEXT: s_bfe_u32 s14, s14, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s17
-; GFX12-NEXT: s_lshr_b32 s13, s7, 24
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT: s_and_b32 s21, s7, 0xff
-; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
+; GFX12-NEXT: s_lshr_b32 s3, s11, 24
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s8
+; GFX12-NEXT: s_and_b32 s21, s11, 0xff
+; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_and_b32 v13, 0xffff, v13
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_and_b32 v13, 0xffff, v13
; GFX12-NEXT: v_dual_mov_b32 v8, s23 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_and_b32 v25, 0xffff, v11
-; GFX12-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_and_b32 v29, 0xffff, v10
+; GFX12-NEXT: v_dual_mov_b32 v30, s12 :: v_dual_and_b32 v29, 0xffff, v10
; GFX12-NEXT: v_dual_mov_b32 v24, s21 :: v_dual_and_b32 v9, 0xffff, v9
-; GFX12-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v11, s15
-; GFX12-NEXT: v_mov_b32_e32 v26, s7
-; GFX12-NEXT: s_lshr_b32 s12, s6, 24
-; GFX12-NEXT: s_and_b32 s20, s6, 0xff
-; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s7
+; GFX12-NEXT: v_mov_b32_e32 v26, s11
+; GFX12-NEXT: s_lshr_b32 s2, s10, 24
+; GFX12-NEXT: s_and_b32 s20, s10, 0xff
+; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_and_b32 v17, 0xffff, v14
-; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_and_b32 v21, 0xffff, v12
-; GFX12-NEXT: v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v20, s20
-; GFX12-NEXT: s_lshr_b32 s3, s5, 24
-; GFX12-NEXT: s_and_b32 s19, s5, 0xff
-; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v27, s13 :: v_dual_mov_b32 v22, s6
-; GFX12-NEXT: s_lshr_b32 s2, s4, 24
-; GFX12-NEXT: s_and_b32 s18, s4, 0xff
-; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v23, s12 :: v_dual_mov_b32 v16, s19
-; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s3
+; GFX12-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_and_b32 v21, 0xffff, v12
+; GFX12-NEXT: v_dual_mov_b32 v31, s6 :: v_dual_mov_b32 v20, s20
+; GFX12-NEXT: s_lshr_b32 s1, s9, 24
+; GFX12-NEXT: s_and_b32 s19, s9, 0xff
+; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v27, s3 :: v_dual_mov_b32 v22, s10
+; GFX12-NEXT: s_lshr_b32 s0, s8, 24
+; GFX12-NEXT: s_and_b32 s18, s8, 0xff
+; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v23, s2 :: v_dual_mov_b32 v16, s19
+; GFX12-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96
-; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s4
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s8
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3231,78 +3231,78 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v32i8_to_v32i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9
-; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10
-; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6
-; GFX12-NEXT: s_ashr_i32 s20, s9, 24
-; GFX12-NEXT: s_bfe_i32 s21, s9, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s9, s9
-; GFX12-NEXT: s_ashr_i32 s24, s11, 24
-; GFX12-NEXT: s_sext_i32_i8 s25, s11
-; GFX12-NEXT: s_bfe_i32 s11, s11, 0x80010
-; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5
-; GFX12-NEXT: s_ashr_i32 s18, s8, 24
-; GFX12-NEXT: s_bfe_i32 s19, s8, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s8, s8
-; GFX12-NEXT: s_ashr_i32 s22, s10, 24
-; GFX12-NEXT: s_bfe_i32 s23, s10, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s10, s10
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s15
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s13
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s12
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s11
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s14
+; GFX12-NEXT: v_lshrrev_b16 v12, 8, s10
+; GFX12-NEXT: s_ashr_i32 s20, s13, 24
+; GFX12-NEXT: s_bfe_i32 s21, s13, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s13, s13
+; GFX12-NEXT: s_ashr_i32 s24, s15, 24
+; GFX12-NEXT: s_sext_i32_i8 s25, s15
+; GFX12-NEXT: s_bfe_i32 s15, s15, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v14, 8, s9
+; GFX12-NEXT: s_ashr_i32 s18, s12, 24
+; GFX12-NEXT: s_bfe_i32 s19, s12, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s12, s12
+; GFX12-NEXT: s_ashr_i32 s22, s14, 24
+; GFX12-NEXT: s_bfe_i32 s23, s14, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s14, s14
; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s24
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT: s_ashr_i32 s12, s5, 24
-; GFX12-NEXT: s_ashr_i32 s14, s6, 24
-; GFX12-NEXT: s_ashr_i32 s16, s7, 24
-; GFX12-NEXT: s_bfe_i32 s17, s7, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s7, s7
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s8
+; GFX12-NEXT: s_ashr_i32 s0, s8, 24
+; GFX12-NEXT: s_bfe_i32 s1, s8, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s2, s8
+; GFX12-NEXT: s_ashr_i32 s3, s9, 24
+; GFX12-NEXT: s_ashr_i32 s8, s10, 24
+; GFX12-NEXT: s_ashr_i32 s16, s11, 24
+; GFX12-NEXT: s_bfe_i32 s17, s11, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s11, s11
; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s22
-; GFX12-NEXT: v_mov_b32_e32 v2, s11
-; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v23, s14
+; GFX12-NEXT: v_mov_b32_e32 v2, s15
+; GFX12-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v23, s8
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-NEXT: v_bfe_i32 v25, v11, 0, 8
; GFX12-NEXT: v_bfe_i32 v29, v10, 0, 8
; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8
-; GFX12-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v19, s12
+; GFX12-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v19, s3
; GFX12-NEXT: v_mov_b32_e32 v11, s20
-; GFX12-NEXT: s_ashr_i32 s2, s4, 24
-; GFX12-NEXT: s_bfe_i32 s15, s6, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s6, s6
-; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v31, s18
+; GFX12-NEXT: s_bfe_i32 s6, s9, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s7, s9
+; GFX12-NEXT: s_bfe_i32 s9, s10, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s10, s10
+; GFX12-NEXT: v_dual_mov_b32 v4, s14 :: v_dual_mov_b32 v31, s18
; GFX12-NEXT: v_dual_mov_b32 v6, s23 :: v_dual_mov_b32 v27, s16
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_dual_mov_b32 v28, s12 :: v_dual_mov_b32 v15, s0
; GFX12-NEXT: v_mov_b32_e32 v30, s19
-; GFX12-NEXT: s_bfe_i32 s13, s5, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s5, s5
-; GFX12-NEXT: v_mov_b32_e32 v24, s7
+; GFX12-NEXT: v_mov_b32_e32 v24, s11
; GFX12-NEXT: v_mov_b32_e32 v26, s17
-; GFX12-NEXT: s_bfe_i32 s3, s4, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s4, s4
; GFX12-NEXT: v_bfe_i32 v21, v12, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v20, s6
-; GFX12-NEXT: v_mov_b32_e32 v22, s15
+; GFX12-NEXT: v_mov_b32_e32 v20, s10
+; GFX12-NEXT: v_mov_b32_e32 v22, s9
; GFX12-NEXT: v_bfe_i32 v17, v14, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v16, s5
-; GFX12-NEXT: v_mov_b32_e32 v18, s13
+; GFX12-NEXT: v_mov_b32_e32 v16, s7
+; GFX12-NEXT: v_mov_b32_e32 v18, s6
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96
-; GFX12-NEXT: v_mov_b32_e32 v12, s4
-; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v12, s2
+; GFX12-NEXT: v_mov_b32_e32 v14, s1
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5234,14 +5234,14 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -5266,13 +5266,13 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i8_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5318,13 +5318,13 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -5352,15 +5352,15 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i8_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_i8 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_i8 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5405,13 +5405,13 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -5436,12 +5436,12 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i8_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5487,13 +5487,13 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -5521,15 +5521,15 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i8_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_i8 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_i8 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5580,14 +5580,14 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_and_b32_e32 v0, 0xff, v2
@@ -5627,16 +5627,16 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i8_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v2, 8, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5689,13 +5689,13 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v2, 8, v0
; GFX8-NOHSA-NEXT: v_bfe_i32 v0, v0, 0, 8
@@ -5738,10 +5738,10 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i8_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v4, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8
@@ -5750,7 +5750,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5812,28 +5812,28 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s2, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v4i8_to_v4i64:
@@ -5870,23 +5870,23 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s0
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x80010
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT: s_lshr_b32 s4, s2, 24
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
+; GFX12-NEXT: s_lshr_b32 s2, s0, 24
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_and_b32 s2, s2, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_and_b32 s0, s0, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5958,31 +5958,31 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s2, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 24
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s0, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s0, 24
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 16
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -6021,26 +6021,26 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_lshr_b32 s4, s2, 16
-; GFX12-NEXT: s_lshr_b32 s6, s2, 24
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_lshr_b32 s2, s0, 16
+; GFX12-NEXT: s_lshr_b32 s6, s0, 24
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s7
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v7, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s6
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6132,40 +6132,40 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s2, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s3, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s2, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s2, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s3, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s1, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s6, s1, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s7, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s1, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
@@ -6227,34 +6227,34 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80010
+; GFX12-NEXT: s_bfe_u32 s2, s1, 0x80010
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: s_lshr_b32 s5, s3, 24
-; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
-; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_lshr_b32 s4, s2, 24
-; GFX12-NEXT: s_bfe_u32 s5, s2, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_lshr_b32 s3, s1, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s0
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_lshr_b32 s2, s0, 24
+; GFX12-NEXT: s_bfe_u32 s3, s0, 0x80010
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_and_b32 s2, s2, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_and_b32 s0, s0, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT: s_and_b32 s2, s3, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_and_b32 s0, s1, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:32
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6368,55 +6368,55 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NOHSA-NEXT: s_mov_b32 s5, 0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_mov_b32 s3, 0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 24
-; GFX8-NOHSA-NEXT: s_mov_b32 s4, s3
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s1, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s0, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s0, 24
+; GFX8-NOHSA-NEXT: s_mov_b32 s2, s1
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s1
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[0:1], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 56
+; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[0:1], 56
; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX8-NOHSA-NEXT: v_bfe_i32 v6, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -6479,40 +6479,40 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 8, s3
-; GFX12-NEXT: s_lshr_b32 s6, s3, 16
-; GFX12-NEXT: s_lshr_b32 s8, s2, 16
-; GFX12-NEXT: s_lshr_b32 s10, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v6, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1
+; GFX12-NEXT: s_lshr_b32 s6, s1, 16
+; GFX12-NEXT: s_lshr_b32 s8, s0, 16
+; GFX12-NEXT: s_lshr_b32 s10, s0, 24
; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s2, s1
; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GFX12-NEXT: v_bfe_i32 v14, v7, 0, 8
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
-; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[0:1], 0x80000
+; GFX12-NEXT: s_ashr_i64 s[0:1], s[0:1], 56
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v9, s9
; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11
-; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s13
+; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s13
; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX12-NEXT: v_mov_b32_e32 v12, s4
+; GFX12-NEXT: v_mov_b32_e32 v12, s2
; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1]
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:32
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6664,82 +6664,82 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s4, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s4
-; GFX8-NOHSA-NEXT: s_and_b32 s12, s5, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s5
-; GFX8-NOHSA-NEXT: s_and_b32 s13, s7, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s14, s6, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s6
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s1, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s3, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s10, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s11, s1, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s12, s3, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s13, s2, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s2
+; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s15, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s3, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 64
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i8_to_v16i64:
@@ -6833,55 +6833,55 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80010
+; GFX12-NEXT: s_bfe_u32 s6, s3, 0x80010
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: s_lshr_b32 s3, s7, 24
-; GFX12-NEXT: s_lshr_b32 s2, s5, 24
-; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_bfe_u32 s3, s5, 0x80010
-; GFX12-NEXT: v_lshrrev_b16 v4, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_lshr_b32 s2, s6, 24
-; GFX12-NEXT: s_bfe_u32 s3, s6, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_lshr_b32 s7, s3, 24
+; GFX12-NEXT: s_lshr_b32 s6, s1, 24
+; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_bfe_u32 s7, s1, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: s_lshr_b32 s6, s2, 24
+; GFX12-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_lshr_b32 s2, s4, 24
-; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80010
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_and_b32 s2, s6, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_and_b32 s2, s2, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: s_lshr_b32 s6, s0, 24
+; GFX12-NEXT: s_bfe_u32 s7, s0, 0x80010
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:80
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5
-; GFX12-NEXT: s_and_b32 s2, s7, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: s_and_b32 s2, s3, 0xff
+; GFX12-NEXT: s_and_b32 s1, s1, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:64
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
-; GFX12-NEXT: s_and_b32 s2, s5, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s0
+; GFX12-NEXT: s_and_b32 s0, s0, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT: s_and_b32 s2, s4, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7081,9 +7081,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s11, 16
; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s10, 16
@@ -7092,16 +7092,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 16
; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s8, 24
; GFX8-NOHSA-NEXT: s_mov_b32 s24, s11
-; GFX8-NOHSA-NEXT: s_mov_b32 s4, s9
+; GFX8-NOHSA-NEXT: s_mov_b32 s2, s9
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s11
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s10
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s9
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s8
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[8:9], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[10:11], 0x80000
; GFX8-NOHSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 56
; GFX8-NOHSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
@@ -7110,18 +7110,18 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s10
-; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70
+; GFX8-NOHSA-NEXT: s_add_u32 s10, s4, 0x70
; GFX8-NOHSA-NEXT: v_bfe_i32 v10, v1, 0, 8
; GFX8-NOHSA-NEXT: v_bfe_i32 v14, v0, 0, 8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s11
-; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11
-; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x50
+; GFX8-NOHSA-NEXT: s_add_u32 s10, s4, 0x50
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15
@@ -7131,53 +7131,53 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s8
-; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 48
+; GFX8-NOHSA-NEXT: s_add_u32 s8, s4, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s9
-; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s19
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 16
+; GFX8-NOHSA-NEXT: s_add_u32 s8, s4, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s20
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s21
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s22
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s23
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60
+; GFX8-NOHSA-NEXT: s_add_u32 s8, s4, 0x60
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s25
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s4, 64
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s7
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NOHSA-NEXT: v_bfe_i32 v6, v4, 0, 8
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 32
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -7282,44 +7282,44 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[8:11], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v10, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v21, 8, s5
-; GFX12-NEXT: v_lshrrev_b16 v23, 8, s4
-; GFX12-NEXT: s_lshr_b32 s8, s7, 16
-; GFX12-NEXT: s_lshr_b32 s10, s6, 16
-; GFX12-NEXT: s_lshr_b32 s12, s6, 24
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s11
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s10
+; GFX12-NEXT: v_lshrrev_b16 v21, 8, s9
+; GFX12-NEXT: v_lshrrev_b16 v23, 8, s8
+; GFX12-NEXT: s_lshr_b32 s2, s11, 16
+; GFX12-NEXT: s_lshr_b32 s6, s10, 16
+; GFX12-NEXT: s_lshr_b32 s12, s10, 24
; GFX12-NEXT: v_bfe_i32 v22, v10, 0, 8
; GFX12-NEXT: v_bfe_i32 v10, v11, 0, 8
-; GFX12-NEXT: s_lshr_b32 s18, s4, 24
-; GFX12-NEXT: s_mov_b32 s20, s7
-; GFX12-NEXT: s_lshr_b32 s14, s5, 16
-; GFX12-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000
-; GFX12-NEXT: s_ashr_i64 s[6:7], s[6:7], 56
+; GFX12-NEXT: s_lshr_b32 s18, s8, 24
+; GFX12-NEXT: s_mov_b32 s20, s11
+; GFX12-NEXT: s_lshr_b32 s14, s9, 16
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[10:11], 0x80000
+; GFX12-NEXT: s_ashr_i64 s[10:11], s[10:11], 56
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX12-NEXT: v_bfe_i32 v28, v21, 0, 8
-; GFX12-NEXT: s_lshr_b32 s16, s4, 16
-; GFX12-NEXT: s_mov_b32 s22, s5
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
-; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56
+; GFX12-NEXT: s_lshr_b32 s16, s8, 16
+; GFX12-NEXT: s_mov_b32 s22, s9
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[8:9], 0x80000
+; GFX12-NEXT: s_ashr_i64 s[8:9], s[8:9], 56
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v7, s5
-; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v13, s11
-; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v7, s9
+; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, s7
+; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v15, s13
; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v5, s15
; GFX12-NEXT: v_bfe_i32 v24, v23, 0, 8
; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v9, s25
+; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v9, s25
; GFX12-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v21, s21
; GFX12-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v27, s23
; GFX12-NEXT: v_ashrrev_i32_e32 v23, 31, v22
@@ -7332,16 +7332,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v19, s19
; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v30, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v30, v[20:23], s[0:1] offset:96
-; GFX12-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
+; GFX12-NEXT: global_store_b128 v30, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v30, v[20:23], s[4:5] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v30, v[12:15], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v30, v[8:11], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v30, v[4:7], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v30, v[26:29], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v30, v[16:19], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v30, v[22:25], s[0:1]
+; GFX12-NEXT: global_store_b128 v30, v[12:15], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v30, v[8:11], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v30, v[4:7], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v30, v[26:29], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v30, v[16:19], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v30, v[22:25], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7613,159 +7613,159 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s5, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s7, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s9, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s15, s11, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s17, s8, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s19, s4, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s2, s4, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s4
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s5, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s20, s6, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s6
-; GFX8-NOHSA-NEXT: s_and_b32 s21, s7, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s7
-; GFX8-NOHSA-NEXT: s_and_b32 s22, s8, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s8
-; GFX8-NOHSA-NEXT: s_and_b32 s23, s9, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s9
-; GFX8-NOHSA-NEXT: s_and_b32 s24, s10, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v12, 8, s10
-; GFX8-NOHSA-NEXT: s_and_b32 s25, s11, 0xff
-; GFX8-NOHSA-NEXT: s_bfe_u32 s26, s4, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s9, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s11, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s13, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s15, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s17, s14, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s12, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s19, s10, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s8, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s8
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s9, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s9
+; GFX8-NOHSA-NEXT: s_and_b32 s21, s10, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s10
+; GFX8-NOHSA-NEXT: s_and_b32 s22, s11, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s11
+; GFX8-NOHSA-NEXT: s_and_b32 s23, s12, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s12
+; GFX8-NOHSA-NEXT: s_and_b32 s24, s13, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s13
+; GFX8-NOHSA-NEXT: s_and_b32 s25, s14, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v12, 8, s14
+; GFX8-NOHSA-NEXT: s_and_b32 s26, s15, 0xff
; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s8, 0x80010
; GFX8-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s27, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s12, s12, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s14, 0x80010
; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s11, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xf0
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xb0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x70
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s15, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xf0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xb0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x70
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xd0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s27
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x90
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xd0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x90
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x50
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xe0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v13, 8, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xc0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s25
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xe0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v13, 8, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xc0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v13
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xa0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xa0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s25
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v12
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x80
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s23
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x80
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x60
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x60
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s23
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 64
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 64
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -8970,13 +8970,13 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -9008,12 +9008,12 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i8_to_i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9056,13 +9056,13 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -9096,12 +9096,12 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i8_to_i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_i8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9144,13 +9144,13 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -9182,12 +9182,12 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i8_to_v1i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9230,13 +9230,13 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -9270,12 +9270,12 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i8_to_v1i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_i8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9324,14 +9324,14 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v4, 8, v2
; GFX8-NOHSA-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -9364,17 +9364,17 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i8_to_v2i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX12-NEXT: v_lshrrev_b16 v1, 8, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9427,14 +9427,14 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_and_b32_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NOHSA-NEXT: v_ashrrev_i16_e32 v2, 8, v2
@@ -9477,17 +9477,17 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i8_to_v2i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX12-NEXT: v_ashrrev_i16 v1, 8, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9540,20 +9540,20 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s0, v3, 16
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 24
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s1, v3, 16
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v2
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s0, v2
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -9606,23 +9606,23 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s3, s2, 16
-; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2
-; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s3
-; GFX12-NEXT: v_lshrrev_b16 v2, 8, s2
-; GFX12-NEXT: s_lshr_b32 s2, s2, 24
+; GFX12-NEXT: s_lshr_b32 s1, s0, 16
+; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s0
+; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s1
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, s0
+; GFX12-NEXT: s_lshr_b32 s0, s0, 24
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v1
-; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX12-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-NEXT: global_store_b64 v3, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9681,22 +9681,22 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 16
-; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s2, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s0, 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s2, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v2, 8, s2
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 16
+; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s1, 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s0, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v2, 8, s0
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s2, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NOHSA-NEXT: s_and_b32 s2, 0xffff, s3
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -9757,22 +9757,22 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i32 s4, s2, 0x80000
-; GFX12-NEXT: s_lshr_b32 s3, s2, 16
-; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2
-; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s4
-; GFX12-NEXT: s_ashr_i32 s2, s2, 24
-; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000
+; GFX12-NEXT: s_bfe_i32 s2, s0, 0x80000
+; GFX12-NEXT: s_lshr_b32 s1, s0, 16
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s0
+; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s2
+; GFX12-NEXT: s_ashr_i32 s0, s0, 24
+; GFX12-NEXT: s_bfe_i32 s1, s1, 0x80000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s1, s0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9843,29 +9843,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 24
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s3, 0x80010
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s0, v0, 16
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s3
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s4, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s0, 24
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s1, 24
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s2, v0, 16
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s1
+; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s3, 0xff
+; GFX8-NOHSA-NEXT: s_or_b32 s2, s4, s3
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s1, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s0, v2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -9948,30 +9948,30 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s5, s2, 16
-; GFX12-NEXT: s_lshr_b32 s6, s3, 16
-; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2
-; GFX12-NEXT: v_and_b32_e64 v2, 0xff, s3
+; GFX12-NEXT: s_lshr_b32 s3, s0, 16
+; GFX12-NEXT: s_lshr_b32 s6, s1, 16
+; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s0
+; GFX12-NEXT: v_and_b32_e64 v2, 0xff, s1
; GFX12-NEXT: v_and_b32_e64 v3, 0xff, s6
-; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s5
+; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s3
; GFX12-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v6, 8, s0
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_lshr_b32 s4, s2, 24
-; GFX12-NEXT: s_lshr_b32 s2, s3, 24
+; GFX12-NEXT: s_lshr_b32 s2, s0, 24
+; GFX12-NEXT: s_lshr_b32 s0, s1, 24
; GFX12-NEXT: v_lshl_or_b32 v0, v6, 16, v0
; GFX12-NEXT: v_lshl_or_b32 v2, v1, 16, v2
-; GFX12-NEXT: v_lshl_or_b32 v3, s2, 16, v3
-; GFX12-NEXT: v_lshl_or_b32 v1, s4, 16, v5
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_lshl_or_b32 v3, s0, 16, v3
+; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v5
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -10054,36 +10054,36 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_bfe_i32 s6, s2, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[2:3], 56
-; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s3, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s6
+; GFX8-NOHSA-NEXT: s_bfe_i32 s6, s0, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[0:1], 56
+; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s1, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s6
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0
-; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s5, 0x80000
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s5, 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s0, 16
+; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT: s_or_b32 s2, s3, s2
+; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s1, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s1
+; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s4, 0x80000
; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
-; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s2, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v1
-; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s2, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s2, s4, 0x80000
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NOHSA-NEXT: s_or_b32 s1, s2, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s3, v1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -10179,29 +10179,29 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i32 s8, s2, 0x80000
-; GFX12-NEXT: s_bfe_i32 s9, s3, 0x80000
-; GFX12-NEXT: s_lshr_b32 s6, s2, 16
-; GFX12-NEXT: s_lshr_b32 s7, s3, 16
-; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2
-; GFX12-NEXT: v_ashrrev_i16 v2, 8, s3
-; GFX12-NEXT: s_ashr_i64 s[4:5], s[2:3], 56
+; GFX12-NEXT: s_bfe_i32 s8, s0, 0x80000
+; GFX12-NEXT: s_bfe_i32 s9, s1, 0x80000
+; GFX12-NEXT: s_lshr_b32 s6, s0, 16
+; GFX12-NEXT: s_lshr_b32 s7, s1, 16
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s0
+; GFX12-NEXT: v_ashrrev_i16 v2, 8, s1
+; GFX12-NEXT: s_ashr_i64 s[2:3], s[0:1], 56
; GFX12-NEXT: v_and_b32_e64 v3, 0xffff, s8
; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s9
-; GFX12-NEXT: s_ashr_i32 s2, s2, 24
-; GFX12-NEXT: s_bfe_i32 s3, s6, 0x80000
-; GFX12-NEXT: s_bfe_i32 s5, s7, 0x80000
-; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2
-; GFX12-NEXT: s_pack_ll_b32_b16 s3, s5, s4
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: s_ashr_i32 s0, s0, 24
+; GFX12-NEXT: s_bfe_i32 s1, s6, 0x80000
+; GFX12-NEXT: s_bfe_i32 s3, s7, 0x80000
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX12-NEXT: s_pack_ll_b32_b16 s1, s3, s2
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s0
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v3
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v5
-; GFX12-NEXT: v_mov_b32_e32 v3, s3
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v3, s1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -10314,53 +10314,53 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s4
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s4, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s3, v1, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s0, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s10, s1, 0xff
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s3, v0
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24
-; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s7, 0x80010
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
-; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s4, s3
-; GFX8-NOHSA-NEXT: s_and_b32 s4, s7, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24
; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24
; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8
-; GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5
-; GFX8-NOHSA-NEXT: s_or_b32 s4, s4, s7
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s2, v3, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff
+; GFX8-NOHSA-NEXT: s_or_b32 s9, s10, s1
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s3, 0xff
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s6, v3, 16
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -10509,27 +10509,27 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s3, s6, 16
-; GFX12-NEXT: s_lshr_b32 s9, s7, 16
-; GFX12-NEXT: s_lshr_b32 s11, s4, 16
-; GFX12-NEXT: s_lshr_b32 s13, s5, 16
-; GFX12-NEXT: v_and_b32_e64 v4, 0xff, s5
-; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s4
-; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s7
-; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s6
+; GFX12-NEXT: s_lshr_b32 s7, s2, 16
+; GFX12-NEXT: s_lshr_b32 s9, s3, 16
+; GFX12-NEXT: s_lshr_b32 s11, s0, 16
+; GFX12-NEXT: s_lshr_b32 s13, s1, 16
+; GFX12-NEXT: v_and_b32_e64 v4, 0xff, s1
+; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s0
+; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s3
+; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s2
; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s9
-; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s3
+; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s7
; GFX12-NEXT: v_and_b32_e64 v9, 0xff, s13
; GFX12-NEXT: v_and_b32_e64 v10, 0xff, s11
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v3, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v0, 8, s4
-; GFX12-NEXT: v_lshrrev_b16 v2, 8, s5
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v3, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v0, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, s1
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v7
@@ -10537,21 +10537,21 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX12-NEXT: s_lshr_b32 s2, s6, 24
-; GFX12-NEXT: s_lshr_b32 s8, s7, 24
-; GFX12-NEXT: s_lshr_b32 s10, s4, 24
-; GFX12-NEXT: s_lshr_b32 s12, s5, 24
+; GFX12-NEXT: s_lshr_b32 s6, s2, 24
+; GFX12-NEXT: s_lshr_b32 s8, s3, 24
+; GFX12-NEXT: s_lshr_b32 s10, s0, 24
+; GFX12-NEXT: s_lshr_b32 s12, s1, 24
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v4
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v5
; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v6
; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7
; GFX12-NEXT: v_lshl_or_b32 v7, s8, 16, v11
-; GFX12-NEXT: v_lshl_or_b32 v5, s2, 16, v12
+; GFX12-NEXT: v_lshl_or_b32 v5, s6, 16, v12
; GFX12-NEXT: v_lshl_or_b32 v3, s12, 16, v9
; GFX12-NEXT: v_lshl_or_b32 v1, s10, 16, v10
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -10689,62 +10689,63 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_bfe_i32 s10, s5, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s5
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s5, 16
+; GFX8-NOHSA-NEXT: s_bfe_i32 s10, s1, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s1, 16
; GFX8-NOHSA-NEXT: s_and_b32 s10, 0xffff, s10
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s4, 16
-; GFX8-NOHSA-NEXT: s_bfe_i32 s5, s4, 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s0, 16
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s0, 0x80000
; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s10, v0
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s4
-; GFX8-NOHSA-NEXT: s_bfe_i32 s4, s3, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s9, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s9
+; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v3, s4, v1
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s2, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s2
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v3, s0, v1
+; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s8, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s8
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s7, 16
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s9, 0x80000
-; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NOHSA-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s7, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s7
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s3, 16
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v1, s0, v1
+; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[2:3], 56
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s7, 0x80000
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s3
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v6, s3, v4
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s6, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s6
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s6, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v6, s1, v4
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s2, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s2
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v4, s3, v4
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s8, 0x80000
-; GFX8-NOHSA-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v5, 8, s8
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v4, s1, v4
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s6, 0x80000
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v5, 8, s6
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v5, s3, v5
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v5, s1, v5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s5, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -10920,40 +10921,40 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s8, s6, 16
-; GFX12-NEXT: v_ashrrev_i16 v5, 8, s6
+; GFX12-NEXT: s_lshr_b32 s6, s2, 16
+; GFX12-NEXT: v_ashrrev_i16 v5, 8, s2
+; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80000
+; GFX12-NEXT: s_lshr_b32 s8, s0, 16
+; GFX12-NEXT: s_lshr_b32 s9, s1, 16
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s1
+; GFX12-NEXT: s_bfe_i32 s10, s1, 0x80000
+; GFX12-NEXT: v_ashrrev_i16 v1, 8, s0
+; GFX12-NEXT: s_bfe_i32 s11, s0, 0x80000
+; GFX12-NEXT: s_bfe_i32 s12, s3, 0x80000
+; GFX12-NEXT: s_ashr_i64 s[0:1], s[2:3], 56
+; GFX12-NEXT: v_ashrrev_i16 v13, 8, s6
; GFX12-NEXT: s_bfe_i32 s6, s6, 0x80000
-; GFX12-NEXT: s_lshr_b32 s10, s4, 16
-; GFX12-NEXT: s_lshr_b32 s11, s5, 16
-; GFX12-NEXT: v_ashrrev_i16 v1, 8, s4
-; GFX12-NEXT: s_bfe_i32 s4, s4, 0x80000
-; GFX12-NEXT: v_ashrrev_i16 v0, 8, s5
-; GFX12-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX12-NEXT: s_bfe_i32 s12, s7, 0x80000
-; GFX12-NEXT: s_ashr_i64 s[2:3], s[6:7], 56
-; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s6
-; GFX12-NEXT: s_bfe_i32 s6, s8, 0x80000
-; GFX12-NEXT: s_lshr_b32 s9, s7, 16
-; GFX12-NEXT: v_and_b32_e64 v7, 0xffff, s4
-; GFX12-NEXT: s_bfe_i32 s3, s11, 0x80000
-; GFX12-NEXT: s_bfe_i32 s4, s10, 0x80000
-; GFX12-NEXT: v_ashrrev_i16 v2, 8, s7
-; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s5
+; GFX12-NEXT: s_lshr_b32 s7, s3, 16
+; GFX12-NEXT: v_ashrrev_i16 v2, 8, s3
+; GFX12-NEXT: s_bfe_i32 s1, s9, 0x80000
+; GFX12-NEXT: s_bfe_i32 s3, s8, 0x80000
+; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s10
+; GFX12-NEXT: v_and_b32_e64 v7, 0xffff, s11
; GFX12-NEXT: v_and_b32_e64 v11, 0xffff, s12
-; GFX12-NEXT: v_ashrrev_i16 v13, 8, s8
+; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s2
; GFX12-NEXT: v_and_b32_e64 v16, 0xffff, s6
-; GFX12-NEXT: v_ashrrev_i16 v9, 8, s11
-; GFX12-NEXT: v_ashrrev_i16 v10, 8, s10
-; GFX12-NEXT: s_bfe_i32 s5, s9, 0x80000
-; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s3
-; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s4
-; GFX12-NEXT: s_pack_ll_b32_b16 s2, s5, s2
+; GFX12-NEXT: v_ashrrev_i16 v9, 8, s9
+; GFX12-NEXT: v_ashrrev_i16 v10, 8, s8
+; GFX12-NEXT: s_bfe_i32 s2, s7, 0x80000
+; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s1
+; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s3
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s2, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s0
; GFX12-NEXT: v_lshl_or_b32 v6, v0, 16, v4
; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v11
@@ -10962,8 +10963,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_lshl_or_b32 v7, v9, 16, v14
; GFX12-NEXT: v_lshl_or_b32 v5, v10, 16, v15
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 21e27bf..8a40901 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -40,19 +40,19 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(
;
; GCN-NOHSA-VI-LABEL: global_load_i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_i16:
@@ -145,19 +145,19 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: global_load_v2i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v2i16:
@@ -236,20 +236,20 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: global_load_v3i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v3i16:
@@ -362,19 +362,19 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: global_load_v4i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v4i16:
@@ -447,19 +447,19 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: global_load_v8i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v8i16:
@@ -546,22 +546,22 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
;
; GCN-NOHSA-VI-LABEL: global_load_v16i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v16i16:
@@ -696,30 +696,30 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
;
; GCN-NOHSA-VI-LABEL: global_load_v16i16_align2:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:14
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:10
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:6
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 offset:2
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v4, off, s[4:7], 0 offset:30
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v5, off, s[4:7], 0 offset:26
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v6, off, s[4:7], 0 offset:22
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v7, off, s[4:7], 0 offset:18
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v8, off, s[4:7], 0 offset:12
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v9, off, s[4:7], 0 offset:8
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v10, off, s[4:7], 0 offset:4
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v11, off, s[4:7], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v12, off, s[4:7], 0 offset:28
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v13, off, s[4:7], 0 offset:24
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v14, off, s[4:7], 0 offset:20
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v15, off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:14
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:10
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:6
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:2
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:30
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:26
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v6, off, s[0:3], 0 offset:22
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v7, off, s[0:3], 0 offset:18
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v8, off, s[0:3], 0 offset:12
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v9, off, s[0:3], 0 offset:8
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v10, off, s[0:3], 0 offset:4
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v11, off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v12, off, s[0:3], 0 offset:28
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v13, off, s[0:3], 0 offset:24
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v14, off, s[0:3], 0 offset:20
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v15, off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -751,8 +751,8 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v14, v18
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v15, v19
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v16i16_align2:
@@ -834,19 +834,19 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr
;
; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_i16_to_i32:
@@ -919,19 +919,19 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr
;
; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_i16_to_i32:
@@ -1007,19 +1007,19 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v1i16_to_v1i32:
@@ -1092,19 +1092,19 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v1i16_to_v1i32:
@@ -1184,21 +1184,21 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i16_to_v2i32:
@@ -1283,21 +1283,21 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i16_to_v2i32:
@@ -1385,22 +1385,22 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v3i16_to_v3i32:
@@ -1495,22 +1495,22 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[3:4], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v3
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v4, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v3i16_to_v3i32:
@@ -1613,23 +1613,23 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i16_to_v4i32:
@@ -1729,23 +1729,23 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v5
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i16_to_v4i32:
@@ -1859,17 +1859,17 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v3
@@ -1879,8 +1879,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v8i16_to_v8i32:
@@ -2008,17 +2008,17 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2
@@ -2028,8 +2028,8 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v0, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v8i16_to_v8i32:
@@ -5158,21 +5158,21 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr
;
; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_i16_to_i64:
@@ -5255,21 +5255,21 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr
;
; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_i16_to_i64:
@@ -5350,21 +5350,21 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v1i16_to_v1i64:
@@ -5442,21 +5442,21 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v1i16_to_v1i64:
@@ -5543,23 +5543,23 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i16_to_v2i64:
@@ -5653,24 +5653,24 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i16_to_v2i64:
@@ -5779,19 +5779,19 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
@@ -5799,8 +5799,8 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v9
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v8
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i16_to_v4i64:
@@ -5925,17 +5925,17 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[1:2], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
@@ -5948,8 +5948,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i16_to_v4i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 0f9cc33..121c436 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -39,19 +39,19 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(
;
; GCNX3-NOHSA-LABEL: global_load_i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_i32:
@@ -118,19 +118,19 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v2i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v2i32:
@@ -198,19 +198,19 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v3i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v3i32:
@@ -282,19 +282,19 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v4i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v4i32:
@@ -375,22 +375,22 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v8i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v8i32:
@@ -492,25 +492,25 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v9i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v9i32:
@@ -623,25 +623,25 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v10i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v10i32:
@@ -753,25 +753,25 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v11i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[8:10], off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v11i32:
@@ -888,25 +888,25 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v12i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v12i32:
@@ -1032,28 +1032,28 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v16i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v16i32:
@@ -1147,20 +1147,20 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr
;
; GCNX3-NOHSA-LABEL: global_zextload_i32_to_i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_i32_to_i64:
@@ -1230,20 +1230,20 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr
;
; GCNX3-NOHSA-LABEL: global_sextload_i32_to_i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_i32_to_i64:
@@ -1314,20 +1314,20 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v1i32_to_v1i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v1i32_to_v1i64:
@@ -1397,20 +1397,20 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_sextload_v1i32_to_v1i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v1i32_to_v1i64:
@@ -1487,23 +1487,23 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v2i32_to_v2i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v2
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v3
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i32_to_v2i64:
@@ -1583,22 +1583,22 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v1
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i32_to_v2i64:
@@ -1694,27 +1694,27 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v4i32_to_v4i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v5
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_nop 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i32_to_v4i64:
@@ -1821,17 +1821,17 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_sextload_v4i32_to_v4i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
@@ -1841,8 +1841,8 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i32_to_v4i64:
@@ -1981,36 +1981,36 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v9
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v2
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v3
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCNX3-NOHSA-NEXT: s_nop 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_nop 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v4
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v5
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v8i32_to_v8i64:
@@ -4515,14 +4515,14 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v32i32:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
@@ -4531,22 +4531,22 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:64
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:80
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v32i32:
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index f19eeee..76d5268 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -4,25 +4,25 @@
define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_flat:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x34
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_cmp_eq_u32 s4, 0
+; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB0_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
-; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GCN-NEXT: s_add_nc_u64 s[0:1], s[6:7], 0xb0
; GCN-NEXT: .LBB0_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
-; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
-; GCN-NEXT: s_add_co_i32 s4, s4, -1
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: s_add_co_i32 s2, s2, -1
; GCN-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_add_nc_u64 s[4:5], s[4:5], 16
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: flat_store_b128 v[4:5], v[0:3]
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
@@ -50,25 +50,25 @@ for.end: ; preds = %for.body, %entry
define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_global:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x34
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_cmp_eq_u32 s4, 0
+; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB1_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
-; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GCN-NEXT: s_add_nc_u64 s[0:1], s[6:7], 0xb0
; GCN-NEXT: .LBB1_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176
-; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
-; GCN-NEXT: s_add_co_i32 s4, s4, -1
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1]
+; GCN-NEXT: global_load_b128 v[1:4], v0, s[0:1] offset:-176
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GCN-NEXT: s_add_co_i32 s2, s2, -1
; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_store_b128 v0, v[1:4], s[4:5]
+; GCN-NEXT: s_add_nc_u64 s[4:5], s[4:5], 16
; GCN-NEXT: s_cbranch_scc1 .LBB1_2
; GCN-NEXT: .LBB1_3: ; %for.end
; GCN-NEXT: s_nop 0
@@ -96,26 +96,26 @@ for.end: ; preds = %for.body, %entry
define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_constant:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x34
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_cmp_eq_u32 s4, 0
+; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
-; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: .LBB2_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
-; GCN-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0
-; GCN-NEXT: s_add_co_i32 s4, s4, -1
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_load_b128 s[8:11], s[6:7], 0x0
+; GCN-NEXT: s_prefetch_data s[6:7], 0xb0, null, 0
+; GCN-NEXT: s_add_co_i32 s2, s2, -1
+; GCN-NEXT: s_add_nc_u64 s[6:7], s[6:7], 16
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
; GCN-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
-; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1]
-; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GCN-NEXT: global_store_b128 v0, v[1:4], s[4:5]
+; GCN-NEXT: s_add_nc_u64 s[4:5], s[4:5], 16
; GCN-NEXT: s_cbranch_scc1 .LBB2_2
; GCN-NEXT: .LBB2_3: ; %for.end
; GCN-NEXT: s_nop 0
@@ -143,20 +143,20 @@ for.end: ; preds = %for.body, %entry
define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_local:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GCN-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_cmp_eq_u32 s2, 0
+; GCN-NEXT: s_cmp_eq_u32 s6, 0
; GCN-NEXT: s_cbranch_scc1 .LBB3_2
; GCN-NEXT: .LBB3_1: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NEXT: s_add_co_i32 s2, s2, -1
-; GCN-NEXT: s_add_co_i32 s0, s0, 16
-; GCN-NEXT: s_add_co_i32 s1, s1, 16
+; GCN-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NEXT: s_add_co_i32 s6, s6, -1
+; GCN-NEXT: s_add_co_i32 s4, s4, 16
+; GCN-NEXT: s_add_co_i32 s5, s5, 16
; GCN-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
; GCN-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_wait_dscnt 0x1
; GCN-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
; GCN-NEXT: s_wait_dscnt 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
index cb3ea2e..ad4af2f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -24,13 +24,13 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce
; GCN-NEXT: ds_write_b8 v0, v1
; GCN-NEXT: ds_read_u8 v2, v0 offset:2
; GCN-NEXT: ds_read_u16 v3, v0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b8 v0, v2 offset:6
; GCN-NEXT: ds_write_b16 v0, v3 offset:4
-; GCN-NEXT: v_cmp_eq_u16_sdwa s[2:3], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; GCN-NEXT: global_store_byte v0, v1, s[0:1]
+; GCN-NEXT: v_cmp_eq_u16_sdwa s[0:1], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GCN-NEXT: global_store_byte v0, v1, s[2:3]
; GCN-NEXT: s_endpgm
; CHECK-LABEL: define protected amdgpu_kernel void @test(
; CHECK-SAME: ptr addrspace(1) nocapture [[PTR_COERCE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
index c6a734a..32318ab 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
@@ -37,10 +38,10 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x2(ptr addrspace(1) %arg, i
; GCN-NEXT: ds_write_b32 v1, v2 offset:256
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:256
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v0, v2, v0
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
store i32 1, ptr addrspace(3) @a, align 4
@@ -88,11 +89,11 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v3, v0 offset:256
; GCN-NEXT: ds_read_b32 v0, v0 offset:512
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v2, v2, v3
; GCN-NEXT: v_add_u32_e32 v0, v2, v0
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
store i32 1, ptr addrspace(3) @a, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index e9a1b38..b11cd19 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -74,7 +74,7 @@ define i64 @add_u64_vv(i64 %v, i64 %a) {
define amdgpu_kernel void @add_u64_sv(i64 %v) {
; GCN-LABEL: add_u64_sv:
-; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GCN: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
%a = load i64, ptr undef
%add = add i64 %v, %a
store i64 %add, ptr undef
@@ -83,7 +83,7 @@ define amdgpu_kernel void @add_u64_sv(i64 %v) {
define amdgpu_kernel void @add_u64_vs(i64 %a) {
; GCN-LABEL: add_u64_vs:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
%v = load i64, ptr undef
%add = add i64 %v, %a
store i64 %add, ptr undef
@@ -93,7 +93,7 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) {
define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_ss:
; GCN: s_add_u32
-; GCN: s_addc_u32 s1, s1, s3
+; GCN: s_addc_u32 s1, s5, s7
%add = add i64 %v, %a
store i64 %add, ptr undef
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 6707132..7361e57 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -79,26 +79,26 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_lshr_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v1, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v4, v1, v0
@@ -131,24 +131,24 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX10-LABEL: v_lshr_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v0, v1, v0
-; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_lshr_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -353,27 +353,27 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1
define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_imm_v_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_imm_v_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v2, v3, 8
@@ -404,24 +404,24 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace
;
; GFX10-LABEL: lshr_imm_v_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lshr_imm_v_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -438,26 +438,26 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_v_imm_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_imm_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3
@@ -485,24 +485,24 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
;
; GFX10-LABEL: lshr_v_imm_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lshr_v_imm_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -519,27 +519,27 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_lshr_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v2, v0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1
@@ -582,26 +582,26 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX10-LABEL: v_lshr_v4i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1
; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_lshr_v4i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v1, v3, v1
; GFX11-NEXT: v_pk_lshrrev_b16 v0, v2, v0
-; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -620,27 +620,27 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_v_imm_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_imm_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v1
@@ -673,26 +673,26 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace
;
; GFX10-LABEL: lshr_v_imm_v4i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lshr_v_imm_v4i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index 995c8c8..5fd0144 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -9,17 +9,17 @@
define amdgpu_kernel void @mad_u16(
; GFX8-LABEL: mad_u16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s9
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s8, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; GFX8-NEXT: v_mov_b32_e32 v5, s11
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s10, v4
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; GFX8-NEXT: flat_load_ushort v6, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -27,57 +27,57 @@ define amdgpu_kernel void @mad_u16(
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_ushort v3, v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mad_u16 v2, v6, v2, v3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: mad_u16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_ushort v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: mad_u16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_ushort v2, v0, s[4:5] glc dlc
+; GFX10-NEXT: global_load_ushort v2, v0, s[8:9] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_ushort v3, v0, s[6:7] glc dlc
+; GFX10-NEXT: global_load_ushort v3, v0, s[10:11] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mad_u16 v1, v1, v2, v3
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: mad_u16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v0, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_u16 v0, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u16 v0, v1, v2, v0
-; GFX11-NEXT: global_store_b16 v3, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v3, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 400298b..c1c526c 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -959,12 +959,12 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s2, s6, s7
-; GFX11-NEXT: s_mul_hi_u32 s3, s6, s7
-; GFX11-NEXT: s_add_u32 s0, s2, s0
-; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: s_mul_i32 s0, s6, s7
+; GFX11-NEXT: s_mul_hi_u32 s1, s6, s7
+; GFX11-NEXT: s_add_u32 s0, s0, s2
+; GFX11-NEXT: s_addc_u32 s1, s1, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -976,15 +976,15 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, s6
+; GFX12-NEXT: s_mov_b32 s0, s6
; GFX12-NEXT: s_mov_b32 s6, s7
-; GFX12-NEXT: s_mov_b32 s7, s3
+; GFX12-NEXT: s_mov_b32 s7, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[6:7]
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index def0dfa..3bb5732 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -35,14 +35,14 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
; GFX8-LABEL: madak_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
@@ -86,12 +86,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -133,12 +133,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -504,14 +504,14 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
; GFX8-LABEL: madak_inline_imm_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
@@ -555,12 +555,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -602,12 +602,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -835,13 +835,13 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-MAD-NEXT: s_clause 0x1
-; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-MAD-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s2, v1
+; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s4, v1
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
-; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-MAD-NEXT: s_nop 0
; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-MAD-NEXT: s_endpgm
@@ -882,11 +882,11 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
@@ -1024,20 +1024,20 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
; GFX8-LABEL: no_madak_src0_modifier_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT: s_mov_b32 s0, 0x41200000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; GFX8-NEXT: s_mov_b32 s0, 0x41200000
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_f32 v2, |v5|, v2, s0
@@ -1077,12 +1077,12 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT: v_mul_f32_e64 v1, |v1|, v2
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1125,12 +1125,12 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1177,20 +1177,20 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
; GFX8-LABEL: no_madak_src1_modifier_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT: s_mov_b32 s0, 0x41200000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; GFX8-NEXT: s_mov_b32 s0, 0x41200000
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_f32 v2, v5, |v2|, s0
@@ -1230,12 +1230,12 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT: v_mul_f32_e64 v1, v1, |v2|
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1278,12 +1278,12 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
index 2b5d32f..e8c6baa 100644
--- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
+++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
@@ -397,82 +397,82 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
; GFX10-LABEL: long_load_chain:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3e
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-NEXT: s_load_dword s3, s[0:1], 0x10
-; GFX10-NEXT: s_load_dword s4, s[0:1], 0x20
-; GFX10-NEXT: s_load_dword s5, s[0:1], 0x30
-; GFX10-NEXT: s_load_dword s6, s[0:1], 0x40
-; GFX10-NEXT: s_load_dword s7, s[0:1], 0x50
-; GFX10-NEXT: s_load_dword s8, s[0:1], 0x60
-; GFX10-NEXT: s_load_dword s9, s[0:1], 0x70
-; GFX10-NEXT: s_load_dword s10, s[0:1], 0x80
-; GFX10-NEXT: s_load_dword s11, s[0:1], 0x90
-; GFX10-NEXT: s_load_dword s12, s[0:1], 0xa0
-; GFX10-NEXT: s_load_dword s13, s[0:1], 0xb0
-; GFX10-NEXT: s_load_dword s14, s[0:1], 0xc0
-; GFX10-NEXT: s_load_dword s15, s[0:1], 0xd0
-; GFX10-NEXT: s_load_dword s16, s[0:1], 0xe0
-; GFX10-NEXT: s_load_dword s17, s[0:1], 0xf0
-; GFX10-NEXT: s_load_dword s18, s[0:1], 0x100
-; GFX10-NEXT: s_load_dword s19, s[0:1], 0x110
-; GFX10-NEXT: s_load_dword s20, s[0:1], 0x120
-; GFX10-NEXT: s_load_dword s21, s[0:1], 0x130
-; GFX10-NEXT: s_load_dword s22, s[0:1], 0x140
-; GFX10-NEXT: s_load_dword s23, s[0:1], 0x150
-; GFX10-NEXT: s_load_dword s24, s[0:1], 0x160
-; GFX10-NEXT: s_load_dword s25, s[0:1], 0x170
-; GFX10-NEXT: s_load_dword s26, s[0:1], 0x180
-; GFX10-NEXT: s_load_dword s27, s[0:1], 0x190
-; GFX10-NEXT: s_load_dword s28, s[0:1], 0x1a0
-; GFX10-NEXT: s_load_dword s29, s[0:1], 0x1b0
-; GFX10-NEXT: s_load_dword s30, s[0:1], 0x1c0
-; GFX10-NEXT: s_load_dword s31, s[0:1], 0x1d0
-; GFX10-NEXT: s_load_dword s33, s[0:1], 0x1e0
-; GFX10-NEXT: s_load_dword s34, s[0:1], 0x1f0
-; GFX10-NEXT: s_load_dword s35, s[0:1], 0x200
-; GFX10-NEXT: s_load_dword s36, s[0:1], 0x210
-; GFX10-NEXT: s_load_dword s37, s[0:1], 0x220
-; GFX10-NEXT: s_load_dword s38, s[0:1], 0x230
-; GFX10-NEXT: s_load_dword s39, s[0:1], 0x240
-; GFX10-NEXT: s_load_dword s40, s[0:1], 0x250
-; GFX10-NEXT: s_load_dword s41, s[0:1], 0x260
-; GFX10-NEXT: s_load_dword s42, s[0:1], 0x270
-; GFX10-NEXT: s_load_dword s43, s[0:1], 0x280
-; GFX10-NEXT: s_load_dword s44, s[0:1], 0x290
-; GFX10-NEXT: s_load_dword s45, s[0:1], 0x2a0
-; GFX10-NEXT: s_load_dword s46, s[0:1], 0x2b0
-; GFX10-NEXT: s_load_dword s47, s[0:1], 0x2c0
-; GFX10-NEXT: s_load_dword s48, s[0:1], 0x2d0
-; GFX10-NEXT: s_load_dword s49, s[0:1], 0x2e0
-; GFX10-NEXT: s_load_dword s50, s[0:1], 0x2f0
-; GFX10-NEXT: s_load_dword s51, s[0:1], 0x300
-; GFX10-NEXT: s_load_dword s52, s[0:1], 0x310
-; GFX10-NEXT: s_load_dword s53, s[0:1], 0x320
-; GFX10-NEXT: s_load_dword s54, s[0:1], 0x330
-; GFX10-NEXT: s_load_dword s55, s[0:1], 0x340
-; GFX10-NEXT: s_load_dword s56, s[0:1], 0x350
-; GFX10-NEXT: s_load_dword s57, s[0:1], 0x360
-; GFX10-NEXT: s_load_dword s58, s[0:1], 0x370
-; GFX10-NEXT: s_load_dword s59, s[0:1], 0x380
-; GFX10-NEXT: s_load_dword s60, s[0:1], 0x390
-; GFX10-NEXT: s_load_dword s61, s[0:1], 0x3a0
-; GFX10-NEXT: s_load_dword s62, s[0:1], 0x3b0
-; GFX10-NEXT: s_load_dword s63, s[0:1], 0x3c0
-; GFX10-NEXT: s_load_dword s64, s[0:1], 0x3d0
-; GFX10-NEXT: s_load_dword s65, s[0:1], 0x3e0
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s1, s[2:3], 0x10
+; GFX10-NEXT: s_load_dword s4, s[2:3], 0x20
+; GFX10-NEXT: s_load_dword s5, s[2:3], 0x30
+; GFX10-NEXT: s_load_dword s6, s[2:3], 0x40
+; GFX10-NEXT: s_load_dword s7, s[2:3], 0x50
+; GFX10-NEXT: s_load_dword s8, s[2:3], 0x60
+; GFX10-NEXT: s_load_dword s9, s[2:3], 0x70
+; GFX10-NEXT: s_load_dword s10, s[2:3], 0x80
+; GFX10-NEXT: s_load_dword s11, s[2:3], 0x90
+; GFX10-NEXT: s_load_dword s12, s[2:3], 0xa0
+; GFX10-NEXT: s_load_dword s13, s[2:3], 0xb0
+; GFX10-NEXT: s_load_dword s14, s[2:3], 0xc0
+; GFX10-NEXT: s_load_dword s15, s[2:3], 0xd0
+; GFX10-NEXT: s_load_dword s16, s[2:3], 0xe0
+; GFX10-NEXT: s_load_dword s17, s[2:3], 0xf0
+; GFX10-NEXT: s_load_dword s18, s[2:3], 0x100
+; GFX10-NEXT: s_load_dword s19, s[2:3], 0x110
+; GFX10-NEXT: s_load_dword s20, s[2:3], 0x120
+; GFX10-NEXT: s_load_dword s21, s[2:3], 0x130
+; GFX10-NEXT: s_load_dword s22, s[2:3], 0x140
+; GFX10-NEXT: s_load_dword s23, s[2:3], 0x150
+; GFX10-NEXT: s_load_dword s24, s[2:3], 0x160
+; GFX10-NEXT: s_load_dword s25, s[2:3], 0x170
+; GFX10-NEXT: s_load_dword s26, s[2:3], 0x180
+; GFX10-NEXT: s_load_dword s27, s[2:3], 0x190
+; GFX10-NEXT: s_load_dword s28, s[2:3], 0x1a0
+; GFX10-NEXT: s_load_dword s29, s[2:3], 0x1b0
+; GFX10-NEXT: s_load_dword s30, s[2:3], 0x1c0
+; GFX10-NEXT: s_load_dword s31, s[2:3], 0x1d0
+; GFX10-NEXT: s_load_dword s33, s[2:3], 0x1e0
+; GFX10-NEXT: s_load_dword s34, s[2:3], 0x1f0
+; GFX10-NEXT: s_load_dword s35, s[2:3], 0x200
+; GFX10-NEXT: s_load_dword s36, s[2:3], 0x210
+; GFX10-NEXT: s_load_dword s37, s[2:3], 0x220
+; GFX10-NEXT: s_load_dword s38, s[2:3], 0x230
+; GFX10-NEXT: s_load_dword s39, s[2:3], 0x240
+; GFX10-NEXT: s_load_dword s40, s[2:3], 0x250
+; GFX10-NEXT: s_load_dword s41, s[2:3], 0x260
+; GFX10-NEXT: s_load_dword s42, s[2:3], 0x270
+; GFX10-NEXT: s_load_dword s43, s[2:3], 0x280
+; GFX10-NEXT: s_load_dword s44, s[2:3], 0x290
+; GFX10-NEXT: s_load_dword s45, s[2:3], 0x2a0
+; GFX10-NEXT: s_load_dword s46, s[2:3], 0x2b0
+; GFX10-NEXT: s_load_dword s47, s[2:3], 0x2c0
+; GFX10-NEXT: s_load_dword s48, s[2:3], 0x2d0
+; GFX10-NEXT: s_load_dword s49, s[2:3], 0x2e0
+; GFX10-NEXT: s_load_dword s50, s[2:3], 0x2f0
+; GFX10-NEXT: s_load_dword s51, s[2:3], 0x300
+; GFX10-NEXT: s_load_dword s52, s[2:3], 0x310
+; GFX10-NEXT: s_load_dword s53, s[2:3], 0x320
+; GFX10-NEXT: s_load_dword s54, s[2:3], 0x330
+; GFX10-NEXT: s_load_dword s55, s[2:3], 0x340
+; GFX10-NEXT: s_load_dword s56, s[2:3], 0x350
+; GFX10-NEXT: s_load_dword s57, s[2:3], 0x360
+; GFX10-NEXT: s_load_dword s58, s[2:3], 0x370
+; GFX10-NEXT: s_load_dword s59, s[2:3], 0x380
+; GFX10-NEXT: s_load_dword s60, s[2:3], 0x390
+; GFX10-NEXT: s_load_dword s61, s[2:3], 0x3a0
+; GFX10-NEXT: s_load_dword s62, s[2:3], 0x3b0
+; GFX10-NEXT: s_load_dword s63, s[2:3], 0x3c0
+; GFX10-NEXT: s_load_dword s64, s[2:3], 0x3d0
+; GFX10-NEXT: s_load_dword s65, s[2:3], 0x3e0
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: s_load_dword s66, s[0:1], 0x3f0
-; GFX10-NEXT: s_load_dword s67, s[0:1], 0x400
-; GFX10-NEXT: s_load_dword s0, s[0:1], 0x410
+; GFX10-NEXT: s_load_dword s66, s[2:3], 0x3f0
+; GFX10-NEXT: s_load_dword s67, s[2:3], 0x400
+; GFX10-NEXT: s_load_dword s2, s[2:3], 0x410
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s2
+; GFX10-NEXT: ; use s0
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s3
+; GFX10-NEXT: ; use s1
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s4
@@ -664,89 +664,89 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
; GFX10-NEXT: ; use s67
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s0
+; GFX10-NEXT: ; use s2
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: long_load_chain:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x10
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x20
-; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x30
-; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x40
-; GFX11-NEXT: s_load_b32 s7, s[0:1], 0x50
-; GFX11-NEXT: s_load_b32 s8, s[0:1], 0x60
-; GFX11-NEXT: s_load_b32 s9, s[0:1], 0x70
-; GFX11-NEXT: s_load_b32 s10, s[0:1], 0x80
-; GFX11-NEXT: s_load_b32 s11, s[0:1], 0x90
-; GFX11-NEXT: s_load_b32 s12, s[0:1], 0xa0
-; GFX11-NEXT: s_load_b32 s13, s[0:1], 0xb0
-; GFX11-NEXT: s_load_b32 s14, s[0:1], 0xc0
-; GFX11-NEXT: s_load_b32 s15, s[0:1], 0xd0
-; GFX11-NEXT: s_load_b32 s16, s[0:1], 0xe0
-; GFX11-NEXT: s_load_b32 s17, s[0:1], 0xf0
-; GFX11-NEXT: s_load_b32 s18, s[0:1], 0x100
-; GFX11-NEXT: s_load_b32 s19, s[0:1], 0x110
-; GFX11-NEXT: s_load_b32 s20, s[0:1], 0x120
-; GFX11-NEXT: s_load_b32 s21, s[0:1], 0x130
-; GFX11-NEXT: s_load_b32 s22, s[0:1], 0x140
-; GFX11-NEXT: s_load_b32 s23, s[0:1], 0x150
-; GFX11-NEXT: s_load_b32 s24, s[0:1], 0x160
-; GFX11-NEXT: s_load_b32 s25, s[0:1], 0x170
-; GFX11-NEXT: s_load_b32 s26, s[0:1], 0x180
-; GFX11-NEXT: s_load_b32 s27, s[0:1], 0x190
-; GFX11-NEXT: s_load_b32 s28, s[0:1], 0x1a0
-; GFX11-NEXT: s_load_b32 s29, s[0:1], 0x1b0
-; GFX11-NEXT: s_load_b32 s30, s[0:1], 0x1c0
-; GFX11-NEXT: s_load_b32 s31, s[0:1], 0x1d0
-; GFX11-NEXT: s_load_b32 s33, s[0:1], 0x1e0
-; GFX11-NEXT: s_load_b32 s34, s[0:1], 0x1f0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x10
+; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x20
+; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x30
+; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x40
+; GFX11-NEXT: s_load_b32 s7, s[2:3], 0x50
+; GFX11-NEXT: s_load_b32 s8, s[2:3], 0x60
+; GFX11-NEXT: s_load_b32 s9, s[2:3], 0x70
+; GFX11-NEXT: s_load_b32 s10, s[2:3], 0x80
+; GFX11-NEXT: s_load_b32 s11, s[2:3], 0x90
+; GFX11-NEXT: s_load_b32 s12, s[2:3], 0xa0
+; GFX11-NEXT: s_load_b32 s13, s[2:3], 0xb0
+; GFX11-NEXT: s_load_b32 s14, s[2:3], 0xc0
+; GFX11-NEXT: s_load_b32 s15, s[2:3], 0xd0
+; GFX11-NEXT: s_load_b32 s16, s[2:3], 0xe0
+; GFX11-NEXT: s_load_b32 s17, s[2:3], 0xf0
+; GFX11-NEXT: s_load_b32 s18, s[2:3], 0x100
+; GFX11-NEXT: s_load_b32 s19, s[2:3], 0x110
+; GFX11-NEXT: s_load_b32 s20, s[2:3], 0x120
+; GFX11-NEXT: s_load_b32 s21, s[2:3], 0x130
+; GFX11-NEXT: s_load_b32 s22, s[2:3], 0x140
+; GFX11-NEXT: s_load_b32 s23, s[2:3], 0x150
+; GFX11-NEXT: s_load_b32 s24, s[2:3], 0x160
+; GFX11-NEXT: s_load_b32 s25, s[2:3], 0x170
+; GFX11-NEXT: s_load_b32 s26, s[2:3], 0x180
+; GFX11-NEXT: s_load_b32 s27, s[2:3], 0x190
+; GFX11-NEXT: s_load_b32 s28, s[2:3], 0x1a0
+; GFX11-NEXT: s_load_b32 s29, s[2:3], 0x1b0
+; GFX11-NEXT: s_load_b32 s30, s[2:3], 0x1c0
+; GFX11-NEXT: s_load_b32 s31, s[2:3], 0x1d0
+; GFX11-NEXT: s_load_b32 s33, s[2:3], 0x1e0
+; GFX11-NEXT: s_load_b32 s34, s[2:3], 0x1f0
; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: s_load_b32 s35, s[0:1], 0x200
-; GFX11-NEXT: s_load_b32 s36, s[0:1], 0x210
-; GFX11-NEXT: s_load_b32 s37, s[0:1], 0x220
-; GFX11-NEXT: s_load_b32 s38, s[0:1], 0x230
-; GFX11-NEXT: s_load_b32 s39, s[0:1], 0x240
-; GFX11-NEXT: s_load_b32 s40, s[0:1], 0x250
-; GFX11-NEXT: s_load_b32 s41, s[0:1], 0x260
-; GFX11-NEXT: s_load_b32 s42, s[0:1], 0x270
-; GFX11-NEXT: s_load_b32 s43, s[0:1], 0x280
-; GFX11-NEXT: s_load_b32 s44, s[0:1], 0x290
-; GFX11-NEXT: s_load_b32 s45, s[0:1], 0x2a0
-; GFX11-NEXT: s_load_b32 s46, s[0:1], 0x2b0
-; GFX11-NEXT: s_load_b32 s47, s[0:1], 0x2c0
-; GFX11-NEXT: s_load_b32 s48, s[0:1], 0x2d0
-; GFX11-NEXT: s_load_b32 s49, s[0:1], 0x2e0
-; GFX11-NEXT: s_load_b32 s50, s[0:1], 0x2f0
-; GFX11-NEXT: s_load_b32 s51, s[0:1], 0x300
-; GFX11-NEXT: s_load_b32 s52, s[0:1], 0x310
-; GFX11-NEXT: s_load_b32 s53, s[0:1], 0x320
-; GFX11-NEXT: s_load_b32 s54, s[0:1], 0x330
-; GFX11-NEXT: s_load_b32 s55, s[0:1], 0x340
-; GFX11-NEXT: s_load_b32 s56, s[0:1], 0x350
-; GFX11-NEXT: s_load_b32 s57, s[0:1], 0x360
-; GFX11-NEXT: s_load_b32 s58, s[0:1], 0x370
-; GFX11-NEXT: s_load_b32 s59, s[0:1], 0x380
-; GFX11-NEXT: s_load_b32 s60, s[0:1], 0x390
-; GFX11-NEXT: s_load_b32 s61, s[0:1], 0x3a0
-; GFX11-NEXT: s_load_b32 s62, s[0:1], 0x3b0
-; GFX11-NEXT: s_load_b32 s63, s[0:1], 0x3c0
-; GFX11-NEXT: s_load_b32 s64, s[0:1], 0x3d0
-; GFX11-NEXT: s_load_b32 s65, s[0:1], 0x3e0
-; GFX11-NEXT: s_load_b32 s66, s[0:1], 0x3f0
+; GFX11-NEXT: s_load_b32 s35, s[2:3], 0x200
+; GFX11-NEXT: s_load_b32 s36, s[2:3], 0x210
+; GFX11-NEXT: s_load_b32 s37, s[2:3], 0x220
+; GFX11-NEXT: s_load_b32 s38, s[2:3], 0x230
+; GFX11-NEXT: s_load_b32 s39, s[2:3], 0x240
+; GFX11-NEXT: s_load_b32 s40, s[2:3], 0x250
+; GFX11-NEXT: s_load_b32 s41, s[2:3], 0x260
+; GFX11-NEXT: s_load_b32 s42, s[2:3], 0x270
+; GFX11-NEXT: s_load_b32 s43, s[2:3], 0x280
+; GFX11-NEXT: s_load_b32 s44, s[2:3], 0x290
+; GFX11-NEXT: s_load_b32 s45, s[2:3], 0x2a0
+; GFX11-NEXT: s_load_b32 s46, s[2:3], 0x2b0
+; GFX11-NEXT: s_load_b32 s47, s[2:3], 0x2c0
+; GFX11-NEXT: s_load_b32 s48, s[2:3], 0x2d0
+; GFX11-NEXT: s_load_b32 s49, s[2:3], 0x2e0
+; GFX11-NEXT: s_load_b32 s50, s[2:3], 0x2f0
+; GFX11-NEXT: s_load_b32 s51, s[2:3], 0x300
+; GFX11-NEXT: s_load_b32 s52, s[2:3], 0x310
+; GFX11-NEXT: s_load_b32 s53, s[2:3], 0x320
+; GFX11-NEXT: s_load_b32 s54, s[2:3], 0x330
+; GFX11-NEXT: s_load_b32 s55, s[2:3], 0x340
+; GFX11-NEXT: s_load_b32 s56, s[2:3], 0x350
+; GFX11-NEXT: s_load_b32 s57, s[2:3], 0x360
+; GFX11-NEXT: s_load_b32 s58, s[2:3], 0x370
+; GFX11-NEXT: s_load_b32 s59, s[2:3], 0x380
+; GFX11-NEXT: s_load_b32 s60, s[2:3], 0x390
+; GFX11-NEXT: s_load_b32 s61, s[2:3], 0x3a0
+; GFX11-NEXT: s_load_b32 s62, s[2:3], 0x3b0
+; GFX11-NEXT: s_load_b32 s63, s[2:3], 0x3c0
+; GFX11-NEXT: s_load_b32 s64, s[2:3], 0x3d0
+; GFX11-NEXT: s_load_b32 s65, s[2:3], 0x3e0
+; GFX11-NEXT: s_load_b32 s66, s[2:3], 0x3f0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s67, s[0:1], 0x400
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x410
+; GFX11-NEXT: s_load_b32 s67, s[2:3], 0x400
+; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x410
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s2
+; GFX11-NEXT: ; use s0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s3
+; GFX11-NEXT: ; use s1
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s4
@@ -938,89 +938,89 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
; GFX11-NEXT: ; use s67
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s0
+; GFX11-NEXT: ; use s2
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: long_load_chain:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1f
-; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x10
-; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x20
-; GFX12-NEXT: s_load_b32 s5, s[0:1], 0x30
-; GFX12-NEXT: s_load_b32 s6, s[0:1], 0x40
-; GFX12-NEXT: s_load_b32 s7, s[0:1], 0x50
-; GFX12-NEXT: s_load_b32 s8, s[0:1], 0x60
-; GFX12-NEXT: s_load_b32 s9, s[0:1], 0x70
-; GFX12-NEXT: s_load_b32 s10, s[0:1], 0x80
-; GFX12-NEXT: s_load_b32 s11, s[0:1], 0x90
-; GFX12-NEXT: s_load_b32 s12, s[0:1], 0xa0
-; GFX12-NEXT: s_load_b32 s13, s[0:1], 0xb0
-; GFX12-NEXT: s_load_b32 s14, s[0:1], 0xc0
-; GFX12-NEXT: s_load_b32 s15, s[0:1], 0xd0
-; GFX12-NEXT: s_load_b32 s16, s[0:1], 0xe0
-; GFX12-NEXT: s_load_b32 s17, s[0:1], 0xf0
-; GFX12-NEXT: s_load_b32 s18, s[0:1], 0x100
-; GFX12-NEXT: s_load_b32 s19, s[0:1], 0x110
-; GFX12-NEXT: s_load_b32 s20, s[0:1], 0x120
-; GFX12-NEXT: s_load_b32 s21, s[0:1], 0x130
-; GFX12-NEXT: s_load_b32 s22, s[0:1], 0x140
-; GFX12-NEXT: s_load_b32 s23, s[0:1], 0x150
-; GFX12-NEXT: s_load_b32 s24, s[0:1], 0x160
-; GFX12-NEXT: s_load_b32 s25, s[0:1], 0x170
-; GFX12-NEXT: s_load_b32 s26, s[0:1], 0x180
-; GFX12-NEXT: s_load_b32 s27, s[0:1], 0x190
-; GFX12-NEXT: s_load_b32 s28, s[0:1], 0x1a0
-; GFX12-NEXT: s_load_b32 s29, s[0:1], 0x1b0
-; GFX12-NEXT: s_load_b32 s30, s[0:1], 0x1c0
-; GFX12-NEXT: s_load_b32 s31, s[0:1], 0x1d0
-; GFX12-NEXT: s_load_b32 s33, s[0:1], 0x1e0
-; GFX12-NEXT: s_load_b32 s34, s[0:1], 0x1f0
+; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x10
+; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x20
+; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x30
+; GFX12-NEXT: s_load_b32 s6, s[2:3], 0x40
+; GFX12-NEXT: s_load_b32 s7, s[2:3], 0x50
+; GFX12-NEXT: s_load_b32 s8, s[2:3], 0x60
+; GFX12-NEXT: s_load_b32 s9, s[2:3], 0x70
+; GFX12-NEXT: s_load_b32 s10, s[2:3], 0x80
+; GFX12-NEXT: s_load_b32 s11, s[2:3], 0x90
+; GFX12-NEXT: s_load_b32 s12, s[2:3], 0xa0
+; GFX12-NEXT: s_load_b32 s13, s[2:3], 0xb0
+; GFX12-NEXT: s_load_b32 s14, s[2:3], 0xc0
+; GFX12-NEXT: s_load_b32 s15, s[2:3], 0xd0
+; GFX12-NEXT: s_load_b32 s16, s[2:3], 0xe0
+; GFX12-NEXT: s_load_b32 s17, s[2:3], 0xf0
+; GFX12-NEXT: s_load_b32 s18, s[2:3], 0x100
+; GFX12-NEXT: s_load_b32 s19, s[2:3], 0x110
+; GFX12-NEXT: s_load_b32 s20, s[2:3], 0x120
+; GFX12-NEXT: s_load_b32 s21, s[2:3], 0x130
+; GFX12-NEXT: s_load_b32 s22, s[2:3], 0x140
+; GFX12-NEXT: s_load_b32 s23, s[2:3], 0x150
+; GFX12-NEXT: s_load_b32 s24, s[2:3], 0x160
+; GFX12-NEXT: s_load_b32 s25, s[2:3], 0x170
+; GFX12-NEXT: s_load_b32 s26, s[2:3], 0x180
+; GFX12-NEXT: s_load_b32 s27, s[2:3], 0x190
+; GFX12-NEXT: s_load_b32 s28, s[2:3], 0x1a0
+; GFX12-NEXT: s_load_b32 s29, s[2:3], 0x1b0
+; GFX12-NEXT: s_load_b32 s30, s[2:3], 0x1c0
+; GFX12-NEXT: s_load_b32 s31, s[2:3], 0x1d0
+; GFX12-NEXT: s_load_b32 s33, s[2:3], 0x1e0
+; GFX12-NEXT: s_load_b32 s34, s[2:3], 0x1f0
; GFX12-NEXT: s_clause 0x1f
-; GFX12-NEXT: s_load_b32 s35, s[0:1], 0x200
-; GFX12-NEXT: s_load_b32 s36, s[0:1], 0x210
-; GFX12-NEXT: s_load_b32 s37, s[0:1], 0x220
-; GFX12-NEXT: s_load_b32 s38, s[0:1], 0x230
-; GFX12-NEXT: s_load_b32 s39, s[0:1], 0x240
-; GFX12-NEXT: s_load_b32 s40, s[0:1], 0x250
-; GFX12-NEXT: s_load_b32 s41, s[0:1], 0x260
-; GFX12-NEXT: s_load_b32 s42, s[0:1], 0x270
-; GFX12-NEXT: s_load_b32 s43, s[0:1], 0x280
-; GFX12-NEXT: s_load_b32 s44, s[0:1], 0x290
-; GFX12-NEXT: s_load_b32 s45, s[0:1], 0x2a0
-; GFX12-NEXT: s_load_b32 s46, s[0:1], 0x2b0
-; GFX12-NEXT: s_load_b32 s47, s[0:1], 0x2c0
-; GFX12-NEXT: s_load_b32 s48, s[0:1], 0x2d0
-; GFX12-NEXT: s_load_b32 s49, s[0:1], 0x2e0
-; GFX12-NEXT: s_load_b32 s50, s[0:1], 0x2f0
-; GFX12-NEXT: s_load_b32 s51, s[0:1], 0x300
-; GFX12-NEXT: s_load_b32 s52, s[0:1], 0x310
-; GFX12-NEXT: s_load_b32 s53, s[0:1], 0x320
-; GFX12-NEXT: s_load_b32 s54, s[0:1], 0x330
-; GFX12-NEXT: s_load_b32 s55, s[0:1], 0x340
-; GFX12-NEXT: s_load_b32 s56, s[0:1], 0x350
-; GFX12-NEXT: s_load_b32 s57, s[0:1], 0x360
-; GFX12-NEXT: s_load_b32 s58, s[0:1], 0x370
-; GFX12-NEXT: s_load_b32 s59, s[0:1], 0x380
-; GFX12-NEXT: s_load_b32 s60, s[0:1], 0x390
-; GFX12-NEXT: s_load_b32 s61, s[0:1], 0x3a0
-; GFX12-NEXT: s_load_b32 s62, s[0:1], 0x3b0
-; GFX12-NEXT: s_load_b32 s63, s[0:1], 0x3c0
-; GFX12-NEXT: s_load_b32 s64, s[0:1], 0x3d0
-; GFX12-NEXT: s_load_b32 s65, s[0:1], 0x3e0
-; GFX12-NEXT: s_load_b32 s66, s[0:1], 0x3f0
+; GFX12-NEXT: s_load_b32 s35, s[2:3], 0x200
+; GFX12-NEXT: s_load_b32 s36, s[2:3], 0x210
+; GFX12-NEXT: s_load_b32 s37, s[2:3], 0x220
+; GFX12-NEXT: s_load_b32 s38, s[2:3], 0x230
+; GFX12-NEXT: s_load_b32 s39, s[2:3], 0x240
+; GFX12-NEXT: s_load_b32 s40, s[2:3], 0x250
+; GFX12-NEXT: s_load_b32 s41, s[2:3], 0x260
+; GFX12-NEXT: s_load_b32 s42, s[2:3], 0x270
+; GFX12-NEXT: s_load_b32 s43, s[2:3], 0x280
+; GFX12-NEXT: s_load_b32 s44, s[2:3], 0x290
+; GFX12-NEXT: s_load_b32 s45, s[2:3], 0x2a0
+; GFX12-NEXT: s_load_b32 s46, s[2:3], 0x2b0
+; GFX12-NEXT: s_load_b32 s47, s[2:3], 0x2c0
+; GFX12-NEXT: s_load_b32 s48, s[2:3], 0x2d0
+; GFX12-NEXT: s_load_b32 s49, s[2:3], 0x2e0
+; GFX12-NEXT: s_load_b32 s50, s[2:3], 0x2f0
+; GFX12-NEXT: s_load_b32 s51, s[2:3], 0x300
+; GFX12-NEXT: s_load_b32 s52, s[2:3], 0x310
+; GFX12-NEXT: s_load_b32 s53, s[2:3], 0x320
+; GFX12-NEXT: s_load_b32 s54, s[2:3], 0x330
+; GFX12-NEXT: s_load_b32 s55, s[2:3], 0x340
+; GFX12-NEXT: s_load_b32 s56, s[2:3], 0x350
+; GFX12-NEXT: s_load_b32 s57, s[2:3], 0x360
+; GFX12-NEXT: s_load_b32 s58, s[2:3], 0x370
+; GFX12-NEXT: s_load_b32 s59, s[2:3], 0x380
+; GFX12-NEXT: s_load_b32 s60, s[2:3], 0x390
+; GFX12-NEXT: s_load_b32 s61, s[2:3], 0x3a0
+; GFX12-NEXT: s_load_b32 s62, s[2:3], 0x3b0
+; GFX12-NEXT: s_load_b32 s63, s[2:3], 0x3c0
+; GFX12-NEXT: s_load_b32 s64, s[2:3], 0x3d0
+; GFX12-NEXT: s_load_b32 s65, s[2:3], 0x3e0
+; GFX12-NEXT: s_load_b32 s66, s[2:3], 0x3f0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b32 s67, s[0:1], 0x400
-; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x410
+; GFX12-NEXT: s_load_b32 s67, s[2:3], 0x400
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x410
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s2
+; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s3
+; GFX12-NEXT: ; use s1
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s4
@@ -1212,7 +1212,7 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
; GFX12-NEXT: ; use s67
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s0
+; GFX12-NEXT: ; use s2
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_endpgm
%v0 = load i32, ptr addrspace(1) %p
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index 8ef2ca2..920b6cc 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -7,14 +7,14 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: v_test_imax_sge_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
@@ -55,14 +55,14 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_test_imax_sge_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
@@ -105,14 +105,14 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_test_imax_sge_v3i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
@@ -176,14 +176,14 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_test_imax_sge_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -230,14 +230,14 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: v_test_imax_sgt_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
@@ -278,14 +278,14 @@ define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: v_test_umax_uge_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
@@ -326,14 +326,14 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: v_test_umax_ugt_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
@@ -373,14 +373,14 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_test_umax_ugt_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 5c88328..51b6410 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -267,9 +267,10 @@ define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocap
;
; GCN-SCRATCH-LABEL: vector_clause_indirect:
; GCN-SCRATCH: ; %bb.0: ; %bb
+; GCN-SCRATCH-NEXT: s_clause 0x1
; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, 0
; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3]
@@ -278,9 +279,9 @@ define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocap
; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[4:5], off
; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v[4:5], off offset:16
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1)
-; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16
; GCN-SCRATCH-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -416,22 +417,22 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GCN-SCRATCH-NEXT: s_clause 0x1
-; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24
-; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44
+; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-SCRATCH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000
-; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1
-; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
+; GCN-SCRATCH-NEXT: s_brev_b32 s0, 1
+; GCN-SCRATCH-NEXT: s_mov_b32 s1, s0
; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-SCRATCH-NEXT: ;;#ASMSTART
; GCN-SCRATCH-NEXT: ;;#ASMEND
; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off
; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s10
-; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s11
-; GCN-SCRATCH-NEXT: s_mov_b32 s11, 0
-; GCN-SCRATCH-NEXT: s_mov_b32 s10, s8
-; GCN-SCRATCH-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s2
+; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s3
+; GCN-SCRATCH-NEXT: s_mov_b32 s3, 0
+; GCN-SCRATCH-NEXT: s_mov_b32 s2, s0
+; GCN-SCRATCH-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GCN-SCRATCH-NEXT: v_add_f32_e32 v0, v2, v0
; GCN-SCRATCH-NEXT: exp mrt0 v0, off, off, off done vm
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 2334543..9586684 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -2478,20 +2478,20 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
;
; VI-LABEL: v_test_umin_ult_i32_multi_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[4:5], 0x0
-; VI-NEXT: s_load_dword s5, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_load_dword s2, s[12:13], 0x0
+; VI-NEXT: s_load_dword s3, s[14:15], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lt_u32 s4, s5
+; VI-NEXT: s_cmp_lt_u32 s2, s3
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT: s_cselect_b32 s0, s4, s5
+; VI-NEXT: s_cselect_b32 s0, s2, s3
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: flat_store_byte v[2:3], v4
@@ -2499,58 +2499,58 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
;
; GFX9-LABEL: v_test_umin_ult_i32_multi_use:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s2, s[12:13], 0x0
+; GFX9-NEXT: s_load_dword s3, s[14:15], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lt_u32 s8, s9
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT: s_cselect_b32 s4, s8, s9
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: s_cmp_lt_u32 s2, s3
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s2, s3
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: global_store_dword v0, v2, s[8:9]
+; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ult_i32_multi_use:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0
-; GFX10-NEXT: s_load_dword s9, s[6:7], 0x0
+; GFX10-NEXT: s_load_dword s0, s[12:13], 0x0
+; GFX10-NEXT: s_load_dword s1, s[14:15], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lt_u32 s8, s9
-; GFX10-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
-; GFX10-NEXT: s_cselect_b32 s4, s8, s9
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
-; GFX10-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX10-NEXT: s_cmp_lt_u32 s0, s1
+; GFX10-NEXT: s_cselect_b32 s2, -1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX10-NEXT: s_cselect_b32 s0, s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: global_store_dword v1, v2, s[8:9]
+; GFX10-NEXT: global_store_byte v1, v0, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_umin_ult_i32_multi_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s5, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[8:9], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[10:11], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lt_u32 s4, s5
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
+; GFX11-NEXT: s_cmp_lt_u32 s0, s1
+; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
-; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
-; GFX11-NEXT: s_cselect_b32 s4, s4, s5
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT: s_cselect_b32 s0, s0, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
-; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT: global_store_b32 v1, v2, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v0, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2629,18 +2629,18 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
;
; VI-LABEL: v_test_umin_ult_i16_multi_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s12
+; VI-NEXT: v_mov_b32_e32 v1, s13
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: flat_load_ushort v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
@@ -2651,50 +2651,50 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
;
; GFX9-LABEL: v_test_umin_ult_i16_multi_use:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
-; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX9-NEXT: global_load_ushort v2, v0, s[14:15]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ult_i16_multi_use:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_ushort v1, v0, s[4:5]
-; GFX10-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: global_store_short v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_umin_ult_i16_multi_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_u16 v1, v0, s[4:5]
-; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: global_load_u16 v1, v0, s[8:9]
+; GFX11-NEXT: global_load_u16 v2, v0, s[10:11]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
+; GFX11-NEXT: global_store_b8 v0, v2, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
index 99120ab..70082e9 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
@@ -7,16 +7,16 @@ define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28744523
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1395630315
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], killed [[REG_SEQUENCE]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%lhs = load volatile i64, ptr addrspace(1) %ptr
%res = add i64 %lhs, 123456789123456789
@@ -30,15 +30,15 @@ define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], [[GLOBAL_LOAD_DWORDX2_SADDR1]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%lhs = load volatile i64, ptr addrspace(1) %ptr
%rhs = load volatile i64, ptr addrspace(1) %ptr
@@ -53,16 +53,16 @@ define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -28744524
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1395630315
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], killed [[REG_SEQUENCE]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%lhs = load volatile i64, ptr addrspace(1) %ptr
%res = sub i64 %lhs, 123456789123456789
@@ -76,16 +76,16 @@ define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28744523
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1395630315
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE]], [[GLOBAL_LOAD_DWORDX2_SADDR]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_SUB_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%rhs = load volatile i64, ptr addrspace(1) %ptr
%res = sub i64 123456789123456789, %rhs
@@ -99,15 +99,15 @@ define amdgpu_kernel void @sub_reg_reg(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], [[GLOBAL_LOAD_DWORDX2_SADDR1]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_SUB_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%lhs = load volatile i64, ptr addrspace(1) %ptr
%rhs = load volatile i64, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
index 1cd9afe..db33ee8 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
@@ -7,13 +7,13 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.exp2.f32(float %val)
@@ -27,14 +27,14 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.exp2.f16(half %val)
@@ -48,13 +48,13 @@ define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_LOG_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.log.f32(float %val)
@@ -68,14 +68,14 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.log.f16(half %val)
@@ -89,13 +89,13 @@ define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.rcp.f32(float %val)
@@ -109,14 +109,14 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.rcp.f16(half %val)
@@ -130,13 +130,13 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RSQ_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.rsq.f32(float %val)
@@ -150,14 +150,14 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.rsq.f16(half %val)
@@ -171,13 +171,13 @@ define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SQRT_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.sqrt.f32(float %val)
@@ -191,14 +191,14 @@ define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.sqrt.f16(half %val)
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 3c60153..c0e0b50 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -31,99 +31,99 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_mul_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u32 v1, v1, v3
; VI-NEXT: v_mul_lo_u32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_mul_v2i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: test_mul_v2i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_mul_v2i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_mul_v2i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -179,117 +179,117 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_mul_v4i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u32 v3, v3, v7
; VI-NEXT: v_mul_lo_u32 v2, v2, v6
; VI-NEXT: v_mul_lo_u32 v1, v1, v5
; VI-NEXT: v_mul_lo_u32 v0, v0, v4
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul_v4i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v3, v3, v7
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v6
; GFX9-NEXT: v_mul_lo_u32 v1, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_v4i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX10-NEXT: s_mov_b32 s4, s0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s0, s4
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u32 v3, v3, v7
; GFX10-NEXT: v_mul_lo_u32 v2, v2, v6
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v5
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_v4i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v3, v3, v7
; GFX11-NEXT: v_mul_lo_u32 v2, v2, v6
; GFX11-NEXT: v_mul_lo_u32 v1, v1, v5
; GFX11-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_v4i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
; GFX12-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v3, v3, v7
; GFX12-NEXT: v_mul_lo_u32 v2, v2, v6
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v5
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -524,23 +524,23 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
-; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -549,23 +549,23 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT: s_mov_b32 s10, -1
-; GFX12-NEXT: s_mov_b32 s11, 0x31016000
-; GFX12-NEXT: s_mov_b32 s14, s10
-; GFX12-NEXT: s_mov_b32 s15, s11
-; GFX12-NEXT: s_mov_b32 s2, s10
-; GFX12-NEXT: s_mov_b32 s3, s11
+; GFX12-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s2
+; GFX12-NEXT: s_mov_b32 s15, s3
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s6
; GFX12-NEXT: s_mov_b32 s13, s7
; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null
-; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null
-; GFX12-NEXT: s_mov_b32 s8, s4
-; GFX12-NEXT: s_mov_b32 s9, s5
+; GFX12-NEXT: buffer_load_b32 v1, off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -618,14 +618,13 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
; VI-LABEL: mul64_sext_c:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x50
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_nop 2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mad_i64_i32 v[0:1], s[0:1], s2, v0, 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: mul64_sext_c:
@@ -661,15 +660,15 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s3, s2, 0x50
-; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50
+; GFX11-NEXT: s_mul_i32 s0, s2, 0x50
+; GFX11-NEXT: s_mul_hi_i32 s1, s2, 0x50
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -729,14 +728,13 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
; VI-LABEL: mul64_zext_c:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x50
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_nop 2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v0, 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: mul64_zext_c:
@@ -772,15 +770,15 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s3, s2, 0x50
-; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50
+; GFX11-NEXT: s_mul_i32 s0, s2, 0x50
+; GFX11-NEXT: s_mul_hi_u32 s1, s2, 0x50
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -845,100 +843,101 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_mul64_sext_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_movk_i32 s2, 0x50
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_movk_i32 s0, 0x50
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_mad_i64_i32 v[0:1], s[0:1], v0, s0, 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_nop 2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul64_sext_c:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_movk_i32 s2, 0x50
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_movk_i32 s0, 0x50
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul64_sext_c:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_hi_i32 v1, 0x50, v0
; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_sext_c:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_hi_i32 v1, 0x50, v0
; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul64_sext_c:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_hi_i32 v1, 0x50, v0
; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -992,100 +991,101 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_mul64_zext_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_movk_i32 s2, 0x50
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_movk_i32 s0, 0x50
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, s2, 0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, s0, 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_nop 2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul64_zext_c:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_movk_i32 s2, 0x50
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_movk_i32 s0, 0x50
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_u32 v1, v0, s2
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_mul_hi_u32 v1, v0, s0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul64_zext_c:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_hi_u32 v1, 0x50, v0
; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_zext_c:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_hi_u32 v1, 0x50, v0
; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul64_zext_c:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_hi_u32 v1, 0x50, v0
; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1138,98 +1138,99 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: v_mul64_sext_inline_imm:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_mad_i64_i32 v[0:1], s[0:1], v0, 9, 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_nop 2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul64_sext_inline_imm:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_hi_i32 v1, v0, 9
; GFX9-NEXT: v_mul_lo_u32 v0, v0, 9
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul64_sext_inline_imm:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_hi_i32 v1, v0, 9
; GFX10-NEXT: v_mul_lo_u32 v0, v0, 9
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_sext_inline_imm:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_hi_i32 v1, v0, 9
; GFX11-NEXT: v_mul_lo_u32 v0, v0, 9
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul64_sext_inline_imm:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_hi_i32 v1, 9, v0
; GFX12-NEXT: v_mul_lo_u32 v0, 9, v0
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1276,15 +1277,15 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
;
; VI-LABEL: s_mul_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
-; VI-NEXT: s_load_dword s5, s[0:1], 0x70
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dword s3, s[0:1], 0x70
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s4, s4, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_mul_i32 s0, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_i32:
@@ -1319,13 +1320,14 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s2, s2, s3
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_mul_i32 s0, s2, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1335,13 +1337,14 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c
; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_i32 s2, s2, s3
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: s_mov_b32 s2, -1
-; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_mul_i32 s0, s2, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1383,94 +1386,94 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: v_mul_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul_i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1517,16 +1520,16 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
;
; VI-LABEL: s_mul_i1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x70
-; VI-NEXT: s_load_dword s5, s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x70
+; VI-NEXT: s_load_dword s3, s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_lo_u16_e32 v0, s5, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_lo_u16_e32 v0, s3, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_i1:
@@ -1562,14 +1565,14 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mul_lo_u16 v0, s2, s3
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1579,14 +1582,14 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c
; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, s2, s3
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, -1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null
+; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1647,109 +1650,109 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: v_mul_i1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul_i1:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_i1:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; GFX10-NEXT: s_mov_b32 s4, s0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s0, s4
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i1:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_u8 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_u8 v1, off, s[8:11], 0 offset:4
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_i1:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: buffer_load_u8 v0, off, s[8:11], null
; GFX12-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null
+; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1881,17 +1884,17 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s1, s6, s1
-; GFX11-NEXT: s_mul_hi_u32 s2, s6, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s1, s2, s1
-; GFX11-NEXT: s_mul_i32 s2, s7, s0
-; GFX11-NEXT: s_mul_i32 s0, s6, s0
-; GFX11-NEXT: s_add_i32 s1, s1, s2
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_mul_i32 s0, s6, s3
+; GFX11-NEXT: s_mul_hi_u32 s1, s6, s2
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NEXT: s_mul_i32 s1, s7, s2
+; GFX11-NEXT: s_mul_i32 s2, s6, s2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: s_mov_b32 s1, s5
@@ -1904,9 +1907,9 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: s_mov_b32 s7, 0x31016000
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_mov_b32 s6, -1
@@ -2049,20 +2052,20 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1
; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0
@@ -2071,7 +2074,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1
; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2080,20 +2083,20 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT: s_mov_b32 s10, -1
-; GFX12-NEXT: s_mov_b32 s11, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, s10
-; GFX12-NEXT: s_mov_b32 s3, s11
-; GFX12-NEXT: s_mov_b32 s14, s10
-; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_mov_b32 s14, s2
+; GFX12-NEXT: s_mov_b32 s15, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s6
; GFX12-NEXT: s_mov_b32 s13, s7
-; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null
-; GFX12-NEXT: s_mov_b32 s8, s4
-; GFX12-NEXT: s_mov_b32 s9, s5
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2
@@ -2102,7 +2105,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2181,32 +2184,32 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 .LBB15_2
; VI-NEXT: ; %bb.1: ; %else
-; VI-NEXT: s_mul_i32 s6, s2, s3
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_mul_i32 s8, s2, s3
+; VI-NEXT: s_mov_b64 s[2:3], 0
; VI-NEXT: s_branch .LBB15_3
; VI-NEXT: .LBB15_2:
-; VI-NEXT: s_mov_b64 s[4:5], -1
-; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: s_mov_b64 s[2:3], -1
+; VI-NEXT: ; implicit-def: $sgpr8
; VI-NEXT: .LBB15_3: ; %Flow
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccnz .LBB15_5
; VI-NEXT: ; %bb.4: ; %if
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT: s_branch .LBB15_6
; VI-NEXT: .LBB15_5:
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: .LBB15_6: ; %endif
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: mul32_in_branch:
@@ -2216,102 +2219,102 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cbranch_scc0 .LBB15_2
; GFX9-NEXT: ; %bb.1: ; %else
-; GFX9-NEXT: s_mul_i32 s6, s2, s3
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_mul_i32 s8, s2, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_branch .LBB15_3
; GFX9-NEXT: .LBB15_2:
-; GFX9-NEXT: s_mov_b64 s[4:5], -1
-; GFX9-NEXT: ; implicit-def: $sgpr6
+; GFX9-NEXT: s_mov_b64 s[2:3], -1
+; GFX9-NEXT: ; implicit-def: $sgpr8
; GFX9-NEXT: .LBB15_3: ; %Flow
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccnz .LBB15_5
; GFX9-NEXT: ; %bb.4: ; %if
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s2
-; GFX9-NEXT: s_mov_b32 s5, s3
-; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s1, s7
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_branch .LBB15_6
; GFX9-NEXT: .LBB15_5:
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: .LBB15_6: ; %endif
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: mul32_in_branch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-NEXT: s_cbranch_scc0 .LBB15_2
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: s_mul_i32 s5, s2, s3
+; GFX10-NEXT: s_mul_i32 s2, s2, s3
; GFX10-NEXT: s_branch .LBB15_3
; GFX10-NEXT: .LBB15_2:
-; GFX10-NEXT: s_mov_b32 s4, -1
-; GFX10-NEXT: ; implicit-def: $sgpr5
+; GFX10-NEXT: s_mov_b32 s8, -1
+; GFX10-NEXT: ; implicit-def: $sgpr2
; GFX10-NEXT: .LBB15_3: ; %Flow
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_vccnz .LBB15_5
; GFX10-NEXT: ; %bb.4: ; %if
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, s2
-; GFX10-NEXT: s_mov_b32 s5, s3
-; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GFX10-NEXT: s_mov_b32 s0, s6
+; GFX10-NEXT: s_mov_b32 s1, s7
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_branch .LBB15_6
; GFX10-NEXT: .LBB15_5:
-; GFX10-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: .LBB15_6: ; %endif
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: mul32_in_branch:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b32 s8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB15_2
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: s_mul_i32 s5, s2, s3
+; GFX11-NEXT: s_mul_i32 s2, s2, s3
; GFX11-NEXT: s_branch .LBB15_3
; GFX11-NEXT: .LBB15_2:
-; GFX11-NEXT: s_mov_b32 s4, -1
-; GFX11-NEXT: ; implicit-def: $sgpr5
+; GFX11-NEXT: s_mov_b32 s8, -1
+; GFX11-NEXT: ; implicit-def: $sgpr2
; GFX11-NEXT: .LBB15_3: ; %Flow
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
; GFX11-NEXT: s_cbranch_vccnz .LBB15_5
; GFX11-NEXT: ; %bb.4: ; %if
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s0, s6
+; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_branch .LBB15_6
; GFX11-NEXT: .LBB15_5:
-; GFX11-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: .LBB15_6: ; %endif
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2319,36 +2322,36 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX12-LABEL: mul32_in_branch:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_mov_b32 s8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-NEXT: s_cbranch_scc0 .LBB15_2
; GFX12-NEXT: ; %bb.1: ; %else
-; GFX12-NEXT: s_mul_i32 s5, s2, s3
+; GFX12-NEXT: s_mul_i32 s2, s2, s3
; GFX12-NEXT: s_branch .LBB15_3
; GFX12-NEXT: .LBB15_2:
-; GFX12-NEXT: s_mov_b32 s4, -1
-; GFX12-NEXT: ; implicit-def: $sgpr5
+; GFX12-NEXT: s_mov_b32 s8, -1
+; GFX12-NEXT: ; implicit-def: $sgpr2
; GFX12-NEXT: .LBB15_3: ; %Flow
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
; GFX12-NEXT: s_cbranch_vccnz .LBB15_5
; GFX12-NEXT: ; %bb.4: ; %if
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s2
-; GFX12-NEXT: s_mov_b32 s5, s3
-; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
+; GFX12-NEXT: s_mov_b32 s0, s6
+; GFX12-NEXT: s_mov_b32 s1, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_branch .LBB15_6
; GFX12-NEXT: .LBB15_5:
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: .LBB15_6: ; %endif
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2445,31 +2448,31 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: mul64_in_branch:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
+; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
; VI-NEXT: s_cbranch_scc0 .LBB16_4
; VI-NEXT: ; %bb.1: ; %else
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0
-; VI-NEXT: s_mul_i32 s4, s4, s7
-; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
-; VI-NEXT: s_mul_i32 s4, s5, s6
-; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
-; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; VI-NEXT: v_mov_b32_e32 v0, s10
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s8, v0, 0
+; VI-NEXT: s_mul_i32 s2, s8, s11
+; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT: s_mul_i32 s2, s9, s10
+; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; VI-NEXT: s_cbranch_vccnz .LBB16_3
; VI-NEXT: .LBB16_2: ; %if
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; VI-NEXT: .LBB16_3: ; %endif
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: .LBB16_3: ; %endif
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB16_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2477,135 +2480,136 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: mul64_in_branch:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_cbranch_scc0 .LBB16_3
; GFX9-NEXT: ; %bb.1: ; %else
-; GFX9-NEXT: s_mul_i32 s7, s4, s7
-; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6
-; GFX9-NEXT: s_add_i32 s7, s10, s7
-; GFX9-NEXT: s_mul_i32 s5, s5, s6
-; GFX9-NEXT: s_add_i32 s5, s7, s5
-; GFX9-NEXT: s_mul_i32 s4, s4, s6
-; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; GFX9-NEXT: s_mul_i32 s2, s8, s11
+; GFX9-NEXT: s_mul_hi_u32 s3, s8, s10
+; GFX9-NEXT: s_add_i32 s2, s3, s2
+; GFX9-NEXT: s_mul_i32 s3, s9, s10
+; GFX9-NEXT: s_add_i32 s3, s2, s3
+; GFX9-NEXT: s_mul_i32 s2, s8, s10
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX9-NEXT: s_cbranch_vccnz .LBB16_4
; GFX9-NEXT: .LBB16_2: ; %if
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s4, s2
-; GFX9-NEXT: s_mov_b32 s5, s3
-; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s1, s7
+; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_branch .LBB16_5
; GFX9-NEXT: .LBB16_3:
-; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX9-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX9-NEXT: s_branch .LBB16_2
; GFX9-NEXT: .LBB16_4:
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: .LBB16_5: ; %endif
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: mul64_in_branch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX10-NEXT: s_cbranch_scc0 .LBB16_3
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: s_mul_i32 s7, s4, s7
-; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6
-; GFX10-NEXT: s_mul_i32 s5, s5, s6
-; GFX10-NEXT: s_add_i32 s7, s8, s7
-; GFX10-NEXT: s_mul_i32 s4, s4, s6
-; GFX10-NEXT: s_add_i32 s5, s7, s5
+; GFX10-NEXT: s_mul_i32 s0, s8, s11
+; GFX10-NEXT: s_mul_hi_u32 s1, s8, s10
+; GFX10-NEXT: s_mul_i32 s2, s9, s10
+; GFX10-NEXT: s_add_i32 s0, s1, s0
+; GFX10-NEXT: s_add_i32 s1, s0, s2
+; GFX10-NEXT: s_mul_i32 s0, s8, s10
; GFX10-NEXT: s_cbranch_execnz .LBB16_4
; GFX10-NEXT: .LBB16_2: ; %if
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s4, s2
-; GFX10-NEXT: s_mov_b32 s5, s3
-; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s0, s6
+; GFX10-NEXT: s_mov_b32 s1, s7
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_branch .LBB16_5
; GFX10-NEXT: .LBB16_3:
-; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX10-NEXT: s_branch .LBB16_2
; GFX10-NEXT: .LBB16_4:
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: .LBB16_5: ; %endif
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: mul64_in_branch:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX11-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX11-NEXT: s_cbranch_scc0 .LBB16_3
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: s_mul_i32 s7, s4, s7
-; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
-; GFX11-NEXT: s_mul_i32 s5, s5, s6
-; GFX11-NEXT: s_add_i32 s7, s8, s7
-; GFX11-NEXT: s_mul_i32 s4, s4, s6
-; GFX11-NEXT: s_add_i32 s5, s7, s5
+; GFX11-NEXT: s_mul_i32 s0, s8, s11
+; GFX11-NEXT: s_mul_hi_u32 s1, s8, s10
+; GFX11-NEXT: s_mul_i32 s2, s9, s10
+; GFX11-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s1, s0, s2
+; GFX11-NEXT: s_mul_i32 s0, s8, s10
; GFX11-NEXT: s_cbranch_execnz .LBB16_4
; GFX11-NEXT: .LBB16_2: ; %if
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s0, s6
+; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_branch .LBB16_5
; GFX11-NEXT: .LBB16_3:
-; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX11-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX11-NEXT: s_branch .LBB16_2
; GFX11-NEXT: .LBB16_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: .LBB16_5: ; %endif
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: mul64_in_branch:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX12-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX12-NEXT: s_cbranch_scc0 .LBB16_3
; GFX12-NEXT: ; %bb.1: ; %else
-; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[8:9], s[10:11]
; GFX12-NEXT: s_cbranch_execnz .LBB16_4
; GFX12-NEXT: .LBB16_2: ; %if
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s4, s2
-; GFX12-NEXT: s_mov_b32 s5, s3
-; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s0, s6
+; GFX12-NEXT: s_mov_b32 s1, s7
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_branch .LBB16_5
; GFX12-NEXT: .LBB16_3:
-; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX12-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX12-NEXT: s_branch .LBB16_2
; GFX12-NEXT: .LBB16_4:
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: .LBB16_5: ; %endif
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2724,41 +2728,41 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
;
; VI-LABEL: s_mul_i128:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0
-; VI-NEXT: s_mul_i32 s7, s8, s7
-; VI-NEXT: v_mov_b32_e32 v6, s8
-; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3
-; VI-NEXT: s_mul_i32 s12, s9, s6
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0
-; VI-NEXT: v_add_u32_e32 v3, vcc, s12, v3
+; VI-NEXT: v_mov_b32_e32 v0, s10
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v0, 0
+; VI-NEXT: s_mul_i32 s0, s12, s11
+; VI-NEXT: v_mov_b32_e32 v6, s12
+; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
+; VI-NEXT: s_mul_i32 s2, s13, s10
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3
; VI-NEXT: v_mov_b32_e32 v4, v1
-; VI-NEXT: v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5]
-; VI-NEXT: v_mov_b32_e32 v8, s4
-; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3]
+; VI-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v6, v[4:5]
+; VI-NEXT: v_mov_b32_e32 v8, s8
+; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v8, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v7
; VI-NEXT: v_mov_b32_e32 v7, v5
-; VI-NEXT: v_mov_b32_e32 v8, s9
-; VI-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7]
-; VI-NEXT: s_mul_i32 s8, s11, s4
-; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v2
+; VI-NEXT: v_mov_b32_e32 v8, s13
+; VI-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v8, v[6:7]
+; VI-NEXT: s_mul_i32 s2, s15, s8
+; VI-NEXT: v_add_u32_e32 v6, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v2, v5
; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; VI-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc
-; VI-NEXT: s_mul_i32 s8, s10, s5
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3]
-; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v6
+; VI-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v8, v[2:3]
+; VI-NEXT: s_mul_i32 s2, s14, s9
+; VI-NEXT: v_add_u32_e32 v5, vcc, s2, v6
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; VI-NEXT: v_mov_b32_e32 v1, v4
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_i128:
@@ -2813,53 +2817,53 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
;
; GFX10-LABEL: s_mul_i128:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c
+; GFX10-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s2, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s13, s2
+; GFX10-NEXT: s_mov_b32 s1, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s3, s8, s7
-; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6
+; GFX10-NEXT: s_mul_i32 s0, s8, s7
+; GFX10-NEXT: s_mul_hi_u32 s3, s8, s6
; GFX10-NEXT: s_mul_i32 s14, s10, s5
; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4
-; GFX10-NEXT: s_mul_i32 s12, s9, s6
+; GFX10-NEXT: s_mul_i32 s7, s9, s6
; GFX10-NEXT: s_mul_i32 s11, s11, s4
-; GFX10-NEXT: s_add_i32 s3, s7, s3
-; GFX10-NEXT: s_add_i32 s7, s15, s14
+; GFX10-NEXT: s_add_i32 s0, s3, s0
+; GFX10-NEXT: s_add_i32 s3, s15, s14
; GFX10-NEXT: s_mul_i32 s6, s8, s6
; GFX10-NEXT: s_mul_i32 s10, s10, s4
-; GFX10-NEXT: s_add_i32 s3, s3, s12
-; GFX10-NEXT: s_add_i32 s7, s7, s11
+; GFX10-NEXT: s_add_i32 s0, s0, s7
+; GFX10-NEXT: s_add_i32 s3, s3, s11
; GFX10-NEXT: s_mul_i32 s19, s5, s8
; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8
; GFX10-NEXT: s_add_u32 s6, s10, s6
; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8
-; GFX10-NEXT: s_addc_u32 s7, s7, s3
+; GFX10-NEXT: s_addc_u32 s7, s3, s0
; GFX10-NEXT: s_mul_i32 s17, s4, s9
-; GFX10-NEXT: s_add_u32 s3, s19, s20
+; GFX10-NEXT: s_add_u32 s0, s19, s20
; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9
; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9
; GFX10-NEXT: s_mul_i32 s5, s5, s9
; GFX10-NEXT: s_addc_u32 s9, s18, 0
-; GFX10-NEXT: s_add_u32 s3, s17, s3
+; GFX10-NEXT: s_add_u32 s3, s17, s0
; GFX10-NEXT: s_addc_u32 s10, s16, 0
-; GFX10-NEXT: s_mul_i32 s12, s4, s8
+; GFX10-NEXT: s_mul_i32 s0, s4, s8
; GFX10-NEXT: s_add_u32 s4, s9, s10
; GFX10-NEXT: s_addc_u32 s8, 0, 0
; GFX10-NEXT: s_add_u32 s4, s5, s4
; GFX10-NEXT: s_addc_u32 s5, s21, s8
; GFX10-NEXT: s_add_u32 s4, s4, s6
; GFX10-NEXT: s_addc_u32 s5, s5, s7
-; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
-; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-NEXT: s_mov_b32 s15, 0x31016000
+; GFX10-NEXT: s_mov_b32 s14, -1
+; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_i128:
@@ -2867,50 +2871,50 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c
; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s2, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mov_b32 s13, s2
+; GFX11-NEXT: s_mov_b32 s1, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s3, s8, s7
-; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6
+; GFX11-NEXT: s_mul_i32 s0, s8, s7
+; GFX11-NEXT: s_mul_hi_u32 s3, s8, s6
; GFX11-NEXT: s_mul_i32 s14, s10, s5
; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4
-; GFX11-NEXT: s_mul_i32 s12, s9, s6
+; GFX11-NEXT: s_mul_i32 s7, s9, s6
; GFX11-NEXT: s_mul_i32 s11, s11, s4
-; GFX11-NEXT: s_add_i32 s3, s7, s3
-; GFX11-NEXT: s_add_i32 s7, s15, s14
+; GFX11-NEXT: s_add_i32 s0, s3, s0
+; GFX11-NEXT: s_add_i32 s3, s15, s14
; GFX11-NEXT: s_mul_i32 s6, s8, s6
; GFX11-NEXT: s_mul_i32 s10, s10, s4
-; GFX11-NEXT: s_add_i32 s3, s3, s12
-; GFX11-NEXT: s_add_i32 s7, s7, s11
+; GFX11-NEXT: s_add_i32 s0, s0, s7
+; GFX11-NEXT: s_add_i32 s3, s3, s11
; GFX11-NEXT: s_mul_i32 s19, s5, s8
; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8
; GFX11-NEXT: s_add_u32 s6, s10, s6
; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8
-; GFX11-NEXT: s_addc_u32 s7, s7, s3
+; GFX11-NEXT: s_addc_u32 s7, s3, s0
; GFX11-NEXT: s_mul_i32 s17, s4, s9
-; GFX11-NEXT: s_add_u32 s3, s19, s20
+; GFX11-NEXT: s_add_u32 s0, s19, s20
; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9
; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9
; GFX11-NEXT: s_mul_i32 s5, s5, s9
; GFX11-NEXT: s_addc_u32 s9, s18, 0
-; GFX11-NEXT: s_add_u32 s3, s17, s3
+; GFX11-NEXT: s_add_u32 s3, s17, s0
; GFX11-NEXT: s_addc_u32 s10, s16, 0
-; GFX11-NEXT: s_mul_i32 s12, s4, s8
+; GFX11-NEXT: s_mul_i32 s0, s4, s8
; GFX11-NEXT: s_add_u32 s4, s9, s10
; GFX11-NEXT: s_addc_u32 s8, 0, 0
; GFX11-NEXT: s_add_u32 s4, s5, s4
; GFX11-NEXT: s_addc_u32 s5, s21, s8
; GFX11-NEXT: s_add_u32 s4, s4, s6
; GFX11-NEXT: s_addc_u32 s5, s5, s7
-; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
+; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: s_mov_b32 s15, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, -1
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[12:15], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2918,44 +2922,44 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX12-LABEL: s_mul_i128:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x7c
-; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x4c
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c
+; GFX12-NEXT: s_load_b128 s[12:15], s[0:1], 0x4c
; GFX12-NEXT: s_mov_b32 s3, 0
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s15, s3
-; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s7, s3
+; GFX12-NEXT: s_mov_b32 s1, s3
; GFX12-NEXT: s_mov_b32 s17, s3
; GFX12-NEXT: s_mov_b32 s19, s3
; GFX12-NEXT: s_mov_b32 s24, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, s4
-; GFX12-NEXT: s_mov_b32 s14, s8
-; GFX12-NEXT: s_mov_b32 s12, s9
-; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[2:3]
-; GFX12-NEXT: s_mul_u64 s[20:21], s[12:13], s[2:3]
+; GFX12-NEXT: s_mov_b32 s2, s8
+; GFX12-NEXT: s_mov_b32 s6, s12
+; GFX12-NEXT: s_mov_b32 s0, s13
+; GFX12-NEXT: s_mul_u64 s[22:23], s[6:7], s[2:3]
+; GFX12-NEXT: s_mul_u64 s[20:21], s[0:1], s[2:3]
; GFX12-NEXT: s_mov_b32 s2, s23
-; GFX12-NEXT: s_mov_b32 s16, s5
-; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11]
-; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[2:3]
-; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9]
-; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17]
-; GFX12-NEXT: s_mov_b32 s2, s11
-; GFX12-NEXT: s_mov_b32 s11, s3
-; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5]
-; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11]
-; GFX12-NEXT: s_mul_u64 s[12:13], s[12:13], s[16:17]
+; GFX12-NEXT: s_mov_b32 s16, s9
+; GFX12-NEXT: s_mul_u64 s[10:11], s[10:11], s[12:13]
+; GFX12-NEXT: s_add_nc_u64 s[12:13], s[20:21], s[2:3]
+; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[16:17]
+; GFX12-NEXT: s_mov_b32 s2, s13
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mul_u64 s[8:9], s[8:9], s[14:15]
+; GFX12-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[12:13]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[16:17]
; GFX12-NEXT: s_mov_b32 s18, s7
; GFX12-NEXT: s_mov_b32 s23, s3
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19]
+; GFX12-NEXT: s_add_nc_u64 s[8:9], s[10:11], s[8:9]
; GFX12-NEXT: s_mov_b32 s25, s6
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[12:13], s[2:3]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[24:25]
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[8:9]
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, -1
-; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3067,15 +3071,15 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
;
; VI-LABEL: v_mul_i128:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v8, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9]
@@ -3107,12 +3111,12 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
;
; GFX9-LABEL: v_mul_i128:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1]
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[4:5]
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2
@@ -3133,18 +3137,18 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX9-NEXT: v_add3_u32 v3, v16, v3, v4
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc
-; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3]
+; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_i128:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 4, v0
; GFX10-NEXT: v_mov_b32_e32 v10, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1]
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3]
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v13, s[4:5]
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[8:9], s0, v0, v4, 0
; GFX10-NEXT: v_mul_lo_u32 v15, v5, v2
@@ -3165,17 +3169,17 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX10-NEXT: v_add3_u32 v3, v7, v3, v12
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3]
+; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i128:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[0:3], v15, s[0:1]
-; GFX11-NEXT: global_load_b128 v[4:7], v15, s[2:3]
+; GFX11-NEXT: global_load_b128 v[0:3], v15, s[4:5]
+; GFX11-NEXT: global_load_b128 v[4:7], v15, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0
; GFX11-NEXT: v_mul_lo_u32 v14, v5, v2
@@ -3201,19 +3205,19 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v13
; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
-; GFX11-NEXT: global_store_b128 v15, v[8:11], s[2:3]
+; GFX11-NEXT: global_store_b128 v15, v[8:11], s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_i128:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_load_b128 v[0:3], v13, s[0:1]
-; GFX12-NEXT: global_load_b128 v[4:7], v13, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v13, s[4:5]
+; GFX12-NEXT: global_load_b128 v[4:7], v13, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0
; GFX12-NEXT: v_mul_lo_u32 v15, v5, v2
@@ -3240,7 +3244,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
; GFX12-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3]
+; GFX12-NEXT: global_store_b128 v13, v[8:11], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 6d7bf00..4770b44 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -291,18 +291,18 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32
;
; VI-LABEL: test_smul24_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
-; VI-NEXT: s_load_dword s5, s[0:1], 0x70
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dword s3, s[0:1], 0x70
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
-; VI-NEXT: s_bfe_i32 s5, s5, 0x180000
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0
-; VI-NEXT: v_mul_i32_i24_e32 v0, s5, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bfe_i32 s0, s2, 0x180000
+; VI-NEXT: s_bfe_i32 s1, s3, 0x180000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s1, v0
+; VI-NEXT: v_mul_i32_i24_e32 v0, s1, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64:
@@ -390,15 +390,15 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a,
;
; VI-LABEL: test_smul24_i64_square:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
-; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
-; VI-NEXT: v_mul_i32_i24_e64 v0, s4, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bfe_i32 s0, s2, 0x180000
+; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s0, s0
+; VI-NEXT: v_mul_i32_i24_e64 v0, s0, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64_square:
@@ -485,21 +485,21 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b
; VI-LABEL: test_smul24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s3, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 8
-; VI-NEXT: s_lshl_b32 s5, s4, 8
-; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40
+; VI-NEXT: s_lshl_b32 s1, s2, 8
+; VI-NEXT: s_lshl_b32 s3, s3, 8
; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0
-; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0
+; VI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
+; VI-NEXT: v_mul_i32_i24_e32 v0, s0, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i33:
@@ -594,16 +594,16 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33
;
; VI-LABEL: test_smulhi24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dword s3, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s3, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smulhi24_i33:
@@ -702,16 +702,16 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0,
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB8_2: ; %bb11
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
-; VI-NEXT: s_bfe_i32 s5, s6, 0x180000
-; VI-NEXT: s_mul_i32 s4, s4, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bfe_i32 s0, s4, 0x180000
+; VI-NEXT: s_bfe_i32 s1, s6, 0x180000
+; VI-NEXT: s_mul_i32 s0, s0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: simplify_i24_crash:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index e6470a5..7c43c0b 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -78,16 +78,16 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i
;
; VI-LABEL: test_umul24_i16_sext:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: s_mul_i32 s4, s4, s5
-; VI-NEXT: s_sext_i32_i16 s4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_mul_i32 s2, s2, s0
+; VI-NEXT: s_sext_i32_i16 s0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_sext:
@@ -136,40 +136,40 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr
;
; VI-LABEL: test_umul24_i16_vgpr_sext:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_vgpr_sext:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
-; GFX9-NEXT: global_load_ushort v3, v1, s[2:3]
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v3, v1, s[6:7]
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -200,16 +200,16 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b
;
; VI-LABEL: test_umul24_i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: s_mul_i32 s4, s4, s5
-; VI-NEXT: s_and_b32 s4, s4, 0xffff
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_mul_i32 s2, s2, s0
+; VI-NEXT: s_and_b32 s0, s2, 0xffff
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16:
@@ -258,38 +258,38 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_umul24_i16_vgpr:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_vgpr:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
-; GFX9-NEXT: global_load_ushort v3, v1, s[2:3]
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v3, v1, s[6:7]
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -331,13 +331,13 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: test_umul24_i8_vgpr:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v1
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
@@ -596,14 +596,14 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3
;
; VI-LABEL: test_umul24_i64_square:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s4, s4
-; VI-NEXT: v_mul_u32_u24_e64 v0, s4, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s2, s2
+; VI-NEXT: v_mul_u32_u24_e64 v0, s2, s2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i64_square:
@@ -703,17 +703,17 @@ define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b
;
; VI-LABEL: test_umul24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dword s3, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mul_u32_u24_e32 v0, s5, v1
-; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s5, v1
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mul_u32_u24_e32 v0, s3, v1
+; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s3, v1
; VI-NEXT: v_and_b32_e32 v1, 1, v1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i33:
@@ -761,16 +761,16 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33
;
; VI-LABEL: test_umulhi24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dword s3, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s5, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s3, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umulhi24_i33:
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index 9ab3ecc..28f6c13 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -2104,10 +2104,10 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
; GFX9-LABEL: flat_inst_salu_offset_1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
@@ -2115,10 +2115,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 1
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 1
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2128,9 +2128,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
;
; GFX11-LABEL: flat_inst_salu_offset_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
@@ -2138,9 +2138,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2154,10 +2154,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
; GFX9-LABEL: flat_inst_salu_offset_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
@@ -2165,10 +2165,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2178,9 +2178,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
;
; GFX11-LABEL: flat_inst_salu_offset_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
@@ -2188,9 +2188,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2204,10 +2204,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
; GFX9-LABEL: flat_inst_salu_offset_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
@@ -2215,10 +2215,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2228,9 +2228,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
;
; GFX11-LABEL: flat_inst_salu_offset_12bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
@@ -2238,9 +2238,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2254,10 +2254,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_13bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
@@ -2267,10 +2267,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2280,11 +2280,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2292,9 +2292,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2302,10 +2302,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_13bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2315,10 +2315,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2334,10 +2334,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2347,10 +2347,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff800
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfffff800
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2360,11 +2360,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2372,9 +2372,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2382,10 +2382,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff800
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2395,10 +2395,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff800
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2414,10 +2414,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2427,10 +2427,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2440,11 +2440,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2452,9 +2452,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2462,10 +2462,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2475,10 +2475,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2494,10 +2494,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2507,10 +2507,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2520,11 +2520,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2532,9 +2532,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2542,10 +2542,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2555,10 +2555,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2574,10 +2574,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
@@ -2585,10 +2585,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2598,9 +2598,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
;
; GFX11-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
@@ -2608,9 +2608,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2624,10 +2624,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
@@ -2637,10 +2637,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2650,11 +2650,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2662,9 +2662,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2672,10 +2672,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2685,10 +2685,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2704,10 +2704,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
@@ -2717,10 +2717,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x3fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0x3fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2730,11 +2730,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x3000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x3000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2742,9 +2742,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2752,10 +2752,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x3fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2765,10 +2765,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x3fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2784,10 +2784,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2797,10 +2797,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2810,11 +2810,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2822,9 +2822,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2832,10 +2832,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2845,10 +2845,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2864,10 +2864,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2877,10 +2877,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2890,11 +2890,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2902,9 +2902,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2912,10 +2912,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2925,10 +2925,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2944,10 +2944,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2957,10 +2957,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2970,11 +2970,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2982,9 +2982,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2992,10 +2992,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3005,10 +3005,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3025,10 +3025,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3037,10 +3037,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3050,11 +3050,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3062,11 +3062,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3074,10 +3074,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3087,10 +3087,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3100,10 +3100,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3120,10 +3120,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3132,10 +3132,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x800
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x800
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3145,11 +3145,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3157,11 +3157,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3169,10 +3169,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3182,10 +3182,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3195,10 +3195,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3215,10 +3215,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3227,10 +3227,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3240,11 +3240,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3252,11 +3252,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3264,10 +3264,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3277,10 +3277,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3290,10 +3290,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3310,10 +3310,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3323,10 +3323,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3336,11 +3336,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3348,11 +3348,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3360,10 +3360,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3373,10 +3373,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3386,10 +3386,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3406,10 +3406,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
@@ -3419,10 +3419,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3432,11 +3432,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3444,11 +3444,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3456,10 +3456,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3469,10 +3469,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3482,10 +3482,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3502,10 +3502,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3515,10 +3515,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3528,11 +3528,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3540,11 +3540,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3552,10 +3552,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3565,10 +3565,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3578,10 +3578,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3598,11 +3598,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3612,10 +3612,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3625,10 +3625,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3638,10 +3638,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561 scope:SCOPE_SYS
@@ -3651,10 +3651,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3664,10 +3664,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3677,10 +3677,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3697,11 +3697,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3711,10 +3711,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x800
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x800
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3724,10 +3724,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3737,10 +3737,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560 scope:SCOPE_SYS
@@ -3750,10 +3750,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3763,10 +3763,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3776,10 +3776,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3796,11 +3796,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3810,10 +3810,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3823,10 +3823,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3836,10 +3836,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513 scope:SCOPE_SYS
@@ -3849,10 +3849,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3862,10 +3862,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3875,10 +3875,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3895,11 +3895,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3909,10 +3909,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3922,10 +3922,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3935,10 +3935,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512 scope:SCOPE_SYS
@@ -3948,10 +3948,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3961,10 +3961,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3974,10 +3974,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3994,11 +3994,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -4008,10 +4008,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -4021,10 +4021,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -4034,10 +4034,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417 scope:SCOPE_SYS
@@ -4047,10 +4047,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -4060,10 +4060,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -4073,10 +4073,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -4093,11 +4093,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -4107,10 +4107,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -4120,10 +4120,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -4133,10 +4133,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416 scope:SCOPE_SYS
@@ -4146,10 +4146,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -4159,10 +4159,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -4172,10 +4172,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 10381bc..8dcca32 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -2176,30 +2176,30 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2208,10 +2208,10 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
;
; GFX12-LABEL: global_inst_salu_offset_1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:1 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2226,30 +2226,30 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2258,10 +2258,10 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
;
; GFX12-LABEL: global_inst_salu_offset_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2276,30 +2276,30 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_12bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2308,10 +2308,10 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
;
; GFX12-LABEL: global_inst_salu_offset_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2326,30 +2326,30 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_13bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_13bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2358,10 +2358,10 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
;
; GFX12-LABEL: global_inst_salu_offset_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8191 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2376,30 +2376,30 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2408,10 +2408,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
;
; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2426,20 +2426,20 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2449,10 +2449,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
;
; GFX11-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2461,10 +2461,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
;
; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2473,10 +2473,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2490,11 +2490,11 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX9-NEXT: s_addc_u32 s1, s3, -1
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -2502,10 +2502,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2515,10 +2515,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -2530,10 +2530,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-8192 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2542,10 +2542,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2553,11 +2553,11 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -2573,30 +2573,30 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2605,10 +2605,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
;
; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2623,30 +2623,30 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2655,10 +2655,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
;
; GFX12-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8191 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2673,30 +2673,30 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2705,10 +2705,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
;
; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:16383 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2723,20 +2723,20 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2746,10 +2746,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
;
; GFX11-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2758,10 +2758,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
;
; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2770,10 +2770,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2787,11 +2787,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX9-NEXT: s_addc_u32 s1, s3, -1
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -2799,10 +2799,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2812,10 +2812,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -2827,10 +2827,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-8192 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2839,10 +2839,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2850,11 +2850,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -2870,11 +2870,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX9-NEXT: s_addc_u32 s1, s3, -1
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -2882,10 +2882,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2895,10 +2895,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -2910,10 +2910,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2922,10 +2922,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2933,11 +2933,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -2954,11 +2954,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -2966,10 +2966,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2979,10 +2979,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -2994,10 +2994,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3009,10 +3009,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3020,11 +3020,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3034,11 +3034,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3055,11 +3055,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x800
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x800
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3067,10 +3067,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3080,10 +3080,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3095,10 +3095,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3110,10 +3110,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3121,11 +3121,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3135,11 +3135,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3156,11 +3156,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3168,10 +3168,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3181,10 +3181,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3196,10 +3196,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3211,10 +3211,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3222,11 +3222,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3236,11 +3236,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3257,11 +3257,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3269,10 +3269,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3282,10 +3282,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3297,10 +3297,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3312,10 +3312,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3323,11 +3323,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3337,11 +3337,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3358,11 +3358,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3370,10 +3370,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3383,10 +3383,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3398,10 +3398,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3413,10 +3413,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1800, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1800, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3424,11 +3424,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3438,11 +3438,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3459,11 +3459,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3471,10 +3471,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3484,10 +3484,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3499,10 +3499,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3514,10 +3514,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3525,11 +3525,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3539,11 +3539,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3560,11 +3560,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3572,11 +3572,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3584,11 +3584,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3598,11 +3598,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3612,12 +3612,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x7ff
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x7ff
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3634,11 +3634,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x800
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x800
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3646,11 +3646,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x800
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x800
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3658,11 +3658,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x800
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3672,11 +3672,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3686,12 +3686,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x800
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x800
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3708,11 +3708,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3720,11 +3720,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3732,11 +3732,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3746,11 +3746,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3760,12 +3760,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0xfff
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xfff
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3782,11 +3782,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3794,11 +3794,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3806,11 +3806,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3820,11 +3820,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3834,12 +3834,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1000
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x1000
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3856,11 +3856,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3868,11 +3868,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3880,11 +3880,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3894,11 +3894,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3908,12 +3908,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1fff
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x1fff
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3930,11 +3930,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3942,11 +3942,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3954,11 +3954,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3968,11 +3968,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3982,12 +3982,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x2000
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x2000
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll
index 769d035..48259163 100644
--- a/llvm/test/CodeGen/AMDGPU/omod.ll
+++ b/llvm/test/CodeGen/AMDGPU/omod.ll
@@ -25,15 +25,15 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac
;
; VI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -43,30 +43,30 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac
;
; GFX11-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -101,15 +101,15 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac
;
; VI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
@@ -119,30 +119,30 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac
;
; GFX11-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1]
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -177,15 +177,15 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out
;
; VI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -195,30 +195,30 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out
;
; GFX11-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -253,15 +253,15 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out
;
; VI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
@@ -271,30 +271,30 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out
;
; GFX11-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1]
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
index bd7f901..5b755d0a 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
@@ -62,15 +62,15 @@ define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p)
define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, ptr addrspace(1) %p) {
; GCN-LABEL: if_masked_0x8000000000000000:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_and_b32 s1, s1, 0x80000000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s1, s5, 0x80000000
; GCN-NEXT: s_cmp_eq_u64 s[0:1], 0
; GCN-NEXT: s_cselect_b32 s0, 22, 33
; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: global_store_dword v0, v1, s[2:3]
+; GCN-NEXT: global_store_dword v0, v1, s[6:7]
; GCN-NEXT: s_endpgm
%and = and i64 %arg, 9223372036854775808
%cmp = icmp eq i64 %and, 0
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 65f4a1b..63e9e60 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -25,21 +25,21 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; GFX8-LABEL: or_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: or_v2i32:
@@ -92,24 +92,24 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; GFX8-LABEL: or_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX8-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v3, v3, v7
; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: or_v4i32:
@@ -258,14 +258,14 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a)
;
; GFX8-LABEL: scalar_or_literal_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s4, s4, 0x1869f
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s2, 0x1869f
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: scalar_or_literal_i32:
@@ -300,16 +300,16 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32
;
; GFX8-LABEL: scalar_or_literal_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s5, s5, 0xf237b
-; GFX8-NEXT: s_or_b32 s4, s4, 0x3039
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s3, 0xf237b
+; GFX8-NEXT: s_or_b32 s1, s2, 0x3039
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: scalar_or_literal_i64:
@@ -357,18 +357,18 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74
-; GFX8-NEXT: s_movk_i32 s8, 0x3039
-; GFX8-NEXT: s_mov_b32 s9, 0xf237b
+; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x74
+; GFX8-NEXT: s_movk_i32 s0, 0x3039
+; GFX8-NEXT: s_mov_b32 s1, 0xf237b
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s0, s0, 0x3039
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s8, 0x3039
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX8-NEXT: s_addc_u32 s1, s1, 0xf237b
+; GFX8-NEXT: s_addc_u32 s1, s9, 0xf237b
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -421,15 +421,15 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x
;
; GFX8-LABEL: scalar_or_inline_imm_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s4, s4, 63
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s2, 63
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: scalar_or_inline_imm_i64:
@@ -534,15 +534,15 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [
;
; GFX8-LABEL: scalar_or_neg_inline_imm_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s4, s4, -8
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s2, -8
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: scalar_or_neg_inline_imm_i64:
@@ -583,20 +583,20 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: vector_or_literal_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_literal_i32:
@@ -642,20 +642,20 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out,
;
; GFX8-LABEL: vector_or_inline_immediate_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 4, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_inline_immediate_i32:
@@ -886,21 +886,21 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: vector_or_i64_loadimm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v1, 0x146f, v1
; GFX8-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_i64_loadimm:
@@ -949,20 +949,20 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-LABEL: vector_or_i64_imm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 8, v0
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_i64_imm:
@@ -1009,21 +1009,21 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p
;
; GFX8-LABEL: vector_or_i64_neg_inline_imm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, -8, v0
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_i64_neg_inline_imm:
@@ -1072,21 +1072,21 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: vector_or_i64_neg_literal:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 0xffffff38, v0
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_i64_neg_literal:
@@ -1129,15 +1129,15 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32],
;
; GFX8-LABEL: trunc_i64_or_to_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c
-; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
+; GFX8-NEXT: s_load_dword s3, s[0:1], 0x74
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s4, s5, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s3, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: trunc_i64_or_to_i32:
@@ -1261,17 +1261,17 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
; GFX8-LABEL: s_or_i1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_cmp_eq_u32 s4, s5
-; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX8-NEXT: s_cmp_eq_u32 s6, s7
-; GFX8-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8-NEXT: buffer_store_byte v0, off, s[8:11], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: s_or_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
index 1899a0ab..4048994 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -299,15 +299,15 @@ define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(ptr addrspace(1) %out
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -330,15 +330,15 @@ define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(ptr addrspace(1) %out
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -362,14 +362,14 @@ define amdgpu_kernel void @add_vector_scalar_hi(ptr addrspace(1) %out, ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_add_u16 v0, v1, v0 op_sel:[0,1]
-; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: global_store_dword v2, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x i16>, ptr addrspace(3) %lds, i32 1
@@ -389,15 +389,15 @@ define amdgpu_kernel void @fma_vector_vector_scalar_hi(ptr addrspace(1) %out, pt
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -420,15 +420,15 @@ define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(ptr addrspace(
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -453,15 +453,15 @@ define amdgpu_kernel void @fma_vector_vector_swap_vector(ptr addrspace(1) %out,
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -483,15 +483,15 @@ define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(ptr addrspace(1) %o
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -514,15 +514,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -544,15 +544,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_lo:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -574,15 +574,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -604,15 +604,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -634,16 +634,16 @@ define amdgpu_kernel void @bitcast_fneg_f32(ptr addrspace(1) %out, ptr addrspace
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GCN-NEXT: v_pk_add_f16 v0, v0, v1
-; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: global_store_dword v2, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
@@ -661,16 +661,16 @@ define amdgpu_kernel void @shuffle_bitcast_fneg_f32(ptr addrspace(1) %out, ptr a
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GCN-NEXT: v_pk_add_f16 v0, v0, v1 op_sel:[0,1] op_sel_hi:[1,0]
-; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: global_store_dword v2, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
@@ -689,18 +689,18 @@ define amdgpu_kernel void @extract_from_i64(ptr addrspace(1) %out, ptr addrspace
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0xffff
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_and_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_add_u16 v0, v2, v0
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
@@ -726,14 +726,14 @@ define amdgpu_kernel void @bitcast_lo_elt_op_sel(ptr addrspace(1) %out, ptr addr
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_ushort v3, v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -764,7 +764,7 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: ; kill: killed $vgpr0_vgpr1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_ushort v3, v[0:1], off glc
@@ -776,7 +776,7 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
index 4794c29..8333386 100644
--- a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
@@ -8,16 +8,16 @@
define amdgpu_kernel void @dbg_clause(ptr addrspace(1) %out, ptr addrspace(1) %aptr) !dbg !4 {
; GCN-LABEL: dbg_clause:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dword v1, v0, s[2:3]
+; GCN-NEXT: global_load_dword v1, v0, s[6:7]
; GCN-NEXT: ;DEBUG_VALUE: foo:a <- $vgpr1
-; GCN-NEXT: global_load_dword v2, v0, s[2:3] offset:32
+; GCN-NEXT: global_load_dword v2, v0, s[6:7] offset:32
; GCN-NEXT: ;DEBUG_VALUE: foo:b <- $vgpr2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, v1, v2
-; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: global_store_dword v0, v1, s[4:5]
; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 3f8b64b..0747760 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -519,11 +519,11 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a
; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s5
; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
@@ -534,11 +534,11 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a
; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s5
; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
diff --git a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
index dabb9d4..0c8dbd1 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
@@ -21,7 +21,7 @@ define protected amdgpu_kernel void @load_v3i32_align8(ptr addrspace(1) %arg) #0
; GCN-LABEL: load_v3i32_align8:
; GCN: ; %bb.0:
; GCN: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
+; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[4:5], 0x0
%vec = load <3 x i32>, ptr addrspace(1) %arg, align 8
store <3 x i32> %vec, ptr addrspace(1) undef, align 8
ret void
@@ -52,7 +52,7 @@ define protected amdgpu_kernel void @load_v3f32_align8(ptr addrspace(1) %arg) #0
; GCN-LABEL: load_v3f32_align8:
; GCN: ; %bb.0:
; GCN: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
+; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[4:5], 0x0
%vec = load <3 x float>, ptr addrspace(1) %arg, align 8
store <3 x float> %vec, ptr addrspace(1) undef, align 8
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
index 2ce0b9e..a82f301 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
@@ -110,46 +110,46 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr
define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) {
; SDAG-LABEL: buffers_might_alias:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
-; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
-; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:12
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:12
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: buffers_might_alias:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
-; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
-; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:12
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:12
; GISEL-NEXT: s_endpgm
%l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0)
%s0 = fmul float %l0, %l0
@@ -173,28 +173,28 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac
define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) {
; SDAG-LABEL: independent_offsets:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SDAG-NEXT: v_mov_b32_e32 v2, 1.0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; SDAG-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen offset:4
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; SDAG-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen
; SDAG-NEXT: s_waitcnt vmcnt(1)
-; SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen offset:8
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: independent_offsets:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GISEL-NEXT: v_mov_b32_e32 v2, 1.0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GISEL-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen offset:4
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GISEL-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen
; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen offset:8
; GISEL-NEXT: s_endpgm
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%idx = shl i32 %lane, 2
diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
index 74bad5ea..5be6082 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -759,12 +759,12 @@ define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src)
;
; VI-LABEL: s_rcp_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -811,12 +811,12 @@ define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float
;
; VI-LABEL: s_rcp_ulp25_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -863,12 +863,12 @@ define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, f
;
; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -915,12 +915,12 @@ define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, f
;
; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -967,12 +967,12 @@ define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1)
;
; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1019,12 +1019,12 @@ define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float %
;
; VI-LABEL: s_rcp_fabs_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e64 v2, |s2|
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e64 v2, |s4|
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1072,12 +1072,12 @@ define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %s
;
; VI-LABEL: s_neg_rcp_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e64 v2, -s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e64 v2, -s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1127,12 +1127,12 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, fl
;
; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e64 v2, -|s2|
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e64 v2, -|s4|
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1188,13 +1188,13 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1
;
; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e64 v2, -|s2|
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mul_f32_e64 v3, s2, -|s2|
+; VI-NEXT: v_rcp_f32_e64 v2, -|s4|
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mul_f32_e64 v3, s4, -|s4|
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
@@ -1254,12 +1254,12 @@ define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0
;
; VI-LABEL: s_div_arcp_2_x_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e64 v2, s2, 0.5
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e64 v2, s4, 0.5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1309,13 +1309,13 @@ define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0
;
; VI-LABEL: s_div_arcp_k_x_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e32 v2, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e32 v2, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1366,13 +1366,13 @@ define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out)
;
; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e32 v2, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e32 v2, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 4a00473..9494b3c 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -112,16 +112,16 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX8-LABEL: rotl_v2i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s2, 32, s6
-; GFX8-NEXT: s_sub_i32 s3, 32, s7
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_sub_i32 s0, 32, s6
+; GFX8-NEXT: s_sub_i32 s1, 32, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v0
; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -143,14 +143,14 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s2, 32, s7
-; GFX11-NEXT: s_sub_i32 s3, 32, s6
-; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s2
-; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sub_i32 s0, 32, s7
+; GFX11-NEXT: s_sub_i32 s1, 32, s6
+; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s0
+; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -211,22 +211,22 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-LABEL: rotl_v4i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s3, 32, s9
+; GFX8-NEXT: s_sub_i32 s1, 32, s9
; GFX8-NEXT: s_sub_i32 s9, 32, s11
-; GFX8-NEXT: s_sub_i32 s2, 32, s8
+; GFX8-NEXT: s_sub_i32 s0, 32, s8
; GFX8-NEXT: s_sub_i32 s8, 32, s10
; GFX8-NEXT: v_mov_b32_e32 v0, s9
; GFX8-NEXT: v_alignbit_b32 v3, s7, s7, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_alignbit_b32 v2, s6, s6, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -252,18 +252,18 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s2, 32, s8
-; GFX11-NEXT: s_sub_i32 s3, 32, s9
+; GFX11-NEXT: s_sub_i32 s0, 32, s8
+; GFX11-NEXT: s_sub_i32 s1, 32, s9
; GFX11-NEXT: s_sub_i32 s8, 32, s11
; GFX11-NEXT: s_sub_i32 s9, 32, s10
; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s8
; GFX11-NEXT: v_alignbit_b32 v2, s6, s6, s9
-; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s2
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s1
+; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s0
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index d6431d7..f9da328 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -99,14 +99,14 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX8-LABEL: rotr_v2i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v0
; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -126,12 +126,12 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s7
; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s6
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -180,7 +180,7 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-LABEL: rotr_v4i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s11
; GFX8-NEXT: v_mov_b32_e32 v1, s10
@@ -189,9 +189,9 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-NEXT: v_alignbit_b32 v2, s6, s6, v1
; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v4
; GFX8-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -213,14 +213,14 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s11
; GFX11-NEXT: v_alignbit_b32 v2, s6, s6, s10
; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s9
; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s8
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index bd3c422..acacf76 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -39,18 +39,18 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; VI-LABEL: saddo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: s_add_u32 s2, s6, s0
+; VI-NEXT: s_add_u32 s0, s6, s2
; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_addc_u32 s3, s7, s1
-; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; VI-NEXT: s_addc_u32 s1, s7, s3
+; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -99,19 +99,19 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s2, s6, s0
-; GFX11-NEXT: s_addc_u32 s3, s7, s1
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
-; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7]
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
+; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, s0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_xor_b32 s2, s2, s3
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -155,17 +155,17 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-LABEL: s_saddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_add_i32 s4, s0, s1
-; VI-NEXT: s_cmp_lt_i32 s1, 0
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT: s_cmp_lt_i32 s4, s0
+; VI-NEXT: s_add_i32 s4, s2, s3
+; VI-NEXT: s_cmp_lt_i32 s3, 0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lt_i32 s4, s2
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dword v[0:1], v4
@@ -208,18 +208,18 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-LABEL: s_saddo_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_i32 v0, s4, s5 clamp
-; GFX11-NEXT: s_add_i32 s4, s4, s5
+; GFX11-NEXT: v_add_nc_i32 v0, s2, s3 clamp
+; GFX11-NEXT: s_add_i32 s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s0, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
-; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT: global_store_b32 v1, v2, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v0, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -264,18 +264,18 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_saddo_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v6, vcc, v4, v5
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5
@@ -288,45 +288,45 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_saddo_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_i32 v3, v1, v2 clamp
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX10-NEXT: global_load_dword v2, v0, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_i32 v3, v1, v2 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT: global_store_byte v0, v2, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_saddo_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: global_load_b32 v1, v0, s[8:9]
+; GFX11-NEXT: global_load_b32 v2, v0, s[10:11]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_i32 v3, v1, v2 clamp
; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2
@@ -334,8 +334,8 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT: global_store_b8 v0, v2, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -379,21 +379,21 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: s_saddo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_add_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_addc_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
+; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[10:11], 0
; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -401,56 +401,56 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: s_saddo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s8, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_addc_u32 s9, s5, s7
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT: s_add_u32 s0, s8, s10
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], vcc
+; GFX9-NEXT: s_addc_u32 s1, s9, s11
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[10:11], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
+; GFX9-NEXT: global_store_byte v2, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_saddo_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s8, s4, s6
-; GFX10-NEXT: s_addc_u32 s9, s5, s7
-; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: s_xor_b32 s4, s6, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX10-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX10-NEXT: s_add_u32 s0, s8, s10
+; GFX10-NEXT: s_addc_u32 s1, s9, s11
+; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[8:9]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_xor_b32 s0, s2, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT: global_store_byte v2, v3, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_saddo_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s8, s4, s6
-; GFX11-NEXT: s_addc_u32 s9, s5, s7
-; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
-; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
-; GFX11-NEXT: v_mov_b32_e32 v0, s8
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: s_add_u32 s0, s8, s10
+; GFX11-NEXT: s_addc_u32 s1, s9, s11
+; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[8:9]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s4, s6, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_xor_b32 s0, s2, s3
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -496,18 +496,18 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_saddo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc
@@ -627,18 +627,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_saddo_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
@@ -656,11 +656,11 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_saddo_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v5, v1, v3
; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp
@@ -670,18 +670,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_v2i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[4:5]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[6:7]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v4, v1, v3
; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp
@@ -691,18 +691,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[0:1]
-; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[4:5]
+; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_saddo_v2i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v5, s[4:5]
-; GFX11-NEXT: global_load_b64 v[2:3], v5, s[6:7]
+; GFX11-NEXT: global_load_b64 v[0:1], v5, s[8:9]
+; GFX11-NEXT: global_load_b64 v[2:3], v5, s[10:11]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v4, v1, v3
; GFX11-NEXT: v_add_nc_i32 v1, v1, v3 clamp
@@ -714,8 +714,8 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1]
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3]
+; GFX11-NEXT: global_store_b64 v5, v[3:4], s[4:5]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 5260a48..ae1b191 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -26,22 +26,22 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: scalar_to_vector_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
%tmp1 = load i32, ptr addrspace(1) %in, align 4
%bc = bitcast i32 %tmp1 to <2 x i16>
@@ -73,22 +73,22 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: scalar_to_vector_v2f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
%tmp1 = load float, ptr addrspace(1) %in, align 4
%bc = bitcast float %tmp1 to <2 x i16>
@@ -230,13 +230,13 @@ define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zero
;
; VI-LABEL: scalar_to_vector_test6:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
%bc = bitcast <4 x i8> %newvec0 to <2 x half>
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 6372d74..ef8e194 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -220,44 +220,44 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; TONGA-LABEL: sdiv_i32_4:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
-; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i32_4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_i32_4:
@@ -316,48 +316,48 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa
;
; TONGA-LABEL: slow_sdiv_i32_3435:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s2, 0x98a1930b
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, 0x98a1930b
+; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_mul_hi_i32 v1, v0, s2
+; TONGA-NEXT: v_mul_hi_i32 v1, v0, s0
+; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: slow_sdiv_i32_3435:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s2, 0x98a1930b
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, 0x98a1930b
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2
+; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0
+; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: v_add_u32_e32 v0, v1, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 11, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: slow_sdiv_i32_3435:
@@ -462,17 +462,17 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; TONGA-LABEL: sdiv_v2i32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s7, 0xf000
; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_mov_b32 s2, s6
+; TONGA-NEXT: s_mov_b32 s3, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, s10
+; TONGA-NEXT: s_mov_b32 s1, s11
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; TONGA-NEXT: s_mov_b32 s4, s8
+; TONGA-NEXT: s_mov_b32 s5, s9
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2
; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v3
@@ -707,17 +707,17 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
;
; TONGA-LABEL: sdiv_v2i32_4:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
@@ -727,22 +727,22 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1
-; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32_4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1
@@ -752,7 +752,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_v2i32_4:
@@ -918,18 +918,18 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; TONGA-LABEL: sdiv_v4i32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s11, 0xf000
; TONGA-NEXT: s_mov_b32 s10, -1
-; TONGA-NEXT: s_mov_b32 s6, s10
-; TONGA-NEXT: s_mov_b32 s7, s11
+; TONGA-NEXT: s_mov_b32 s2, s10
+; TONGA-NEXT: s_mov_b32 s3, s11
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s4, s2
-; TONGA-NEXT: s_mov_b32 s5, s3
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; TONGA-NEXT: s_mov_b32 s8, s0
-; TONGA-NEXT: s_mov_b32 s9, s1
+; TONGA-NEXT: s_mov_b32 s0, s6
+; TONGA-NEXT: s_mov_b32 s1, s7
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; TONGA-NEXT: s_mov_b32 s8, s4
+; TONGA-NEXT: s_mov_b32 s9, s5
; TONGA-NEXT: s_waitcnt vmcnt(1)
; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1
; TONGA-NEXT: s_waitcnt vmcnt(0)
@@ -1371,17 +1371,17 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: sdiv_v4i32_4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1
@@ -1399,7 +1399,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 2, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 2, v3
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_v4i32_4:
@@ -1482,18 +1482,18 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; TONGA-LABEL: v_sdiv_i8:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1
; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(1)
; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v0
; TONGA-NEXT: s_waitcnt vmcnt(0)
@@ -1510,23 +1510,23 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8
-; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: v_sdiv_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1
; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -1543,7 +1543,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v_sdiv_i8:
@@ -2221,21 +2221,21 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
;
; TONGA-LABEL: scalarize_mulhs_4xi32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; TONGA-NEXT: s_mov_b32 s0, 0x1389c755
-; TONGA-NEXT: s_mov_b32 s4, s2
-; TONGA-NEXT: s_mov_b32 s5, s3
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_mov_b32 s1, s5
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; TONGA-NEXT: s_mov_b32 s4, 0x1389c755
+; TONGA-NEXT: s_mov_b32 s0, s6
+; TONGA-NEXT: s_mov_b32 s1, s7
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0
-; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0
-; TONGA-NEXT: v_mul_hi_i32 v2, v2, s0
-; TONGA-NEXT: v_mul_hi_i32 v3, v3, s0
+; TONGA-NEXT: v_mul_hi_i32 v0, v0, s4
+; TONGA-NEXT: v_mul_hi_i32 v1, v1, s4
+; TONGA-NEXT: v_mul_hi_i32 v2, v2, s4
+; TONGA-NEXT: v_mul_hi_i32 v3, v3, s4
; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1
@@ -2248,26 +2248,26 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: scalarize_mulhs_4xi32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT: s_mov_b32 s0, 0x1389c755
-; GFX9-NEXT: s_mov_b32 s4, s2
-; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s4, 0x1389c755
+; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s1, s7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0
-; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0
-; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0
-; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0
+; GFX9-NEXT: v_mul_hi_i32 v0, v0, s4
+; GFX9-NEXT: v_mul_hi_i32 v1, v1, s4
+; GFX9-NEXT: v_mul_hi_i32 v2, v2, s4
+; GFX9-NEXT: v_mul_hi_i32 v3, v3, s4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1
@@ -2280,7 +2280,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
; GFX9-NEXT: v_add_u32_e32 v3, v3, v7
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: scalarize_mulhs_4xi32:
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 0f2eedb..b271a03 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -7,13 +7,13 @@
define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; NOSDWA-LABEL: add_shr_i32:
; NOSDWA: ; %bb.0:
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v3, v2
@@ -22,13 +22,13 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX89-LABEL: add_shr_i32:
; GFX89: ; %bb.0:
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
+; GFX89-NEXT: v_mov_b32_e32 v0, s6
+; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: flat_load_dword v2, v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX89-NEXT: flat_store_dword v[0:1], v2
@@ -36,24 +36,24 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: add_shr_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_shr_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in, align 4
%shr = lshr i32 %a, 16
@@ -65,13 +65,13 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; NOSDWA-LABEL: sub_shr_i32:
; NOSDWA: ; %bb.0:
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; NOSDWA-NEXT: v_sub_u32_e32 v2, vcc, v3, v2
@@ -80,13 +80,13 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX89-LABEL: sub_shr_i32:
; GFX89: ; %bb.0:
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
+; GFX89-NEXT: v_mov_b32_e32 v0, s6
+; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: flat_load_dword v2, v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_sub_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX89-NEXT: flat_store_dword v[0:1], v2
@@ -94,24 +94,24 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: sub_shr_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_shr_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in, align 4
%shr = lshr i32 %a, 16
@@ -124,14 +124,14 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; NOSDWA-LABEL: mul_shr_i32:
; NOSDWA: ; %bb.0:
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dword v4, v[0:1]
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
@@ -148,14 +148,14 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX89-LABEL: mul_shr_i32:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
@@ -211,14 +211,14 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina,
; NOSDWA-LABEL: mul_i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_ushort v4, v[0:1]
; NOSDWA-NEXT: flat_load_ushort v2, v[2:3]
@@ -232,14 +232,14 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina,
; GFX89-LABEL: mul_i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_ushort v4, v[0:1]
; GFX89-NEXT: flat_load_ushort v2, v[2:3]
@@ -294,14 +294,14 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; NOSDWA-LABEL: mul_v2i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dword v4, v[0:1]
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
@@ -320,14 +320,14 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX89-LABEL: mul_v2i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
@@ -384,14 +384,14 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; NOSDWA-LABEL: mul_v4i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -416,14 +416,14 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX89-LABEL: mul_v4i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -485,14 +485,14 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; NOSDWA-LABEL: mul_v8i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v4, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v4, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -529,14 +529,14 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX89-LABEL: mul_v8i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v4, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v4, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX89-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -608,12 +608,12 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina
; NOSDWA-LABEL: mul_half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: flat_load_ushort v4, v[0:1]
; NOSDWA-NEXT: flat_load_ushort v2, v[2:3]
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
@@ -626,12 +626,12 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina
; GFX89-LABEL: mul_half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_ushort v4, v[0:1]
; GFX89-NEXT: flat_load_ushort v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
@@ -680,11 +680,11 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-LABEL: mul_v2half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
@@ -704,12 +704,12 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX89-LABEL: mul_v2half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
@@ -760,12 +760,12 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-LABEL: mul_v4half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; NOSDWA-NEXT: v_mov_b32_e32 v4, s4
@@ -790,12 +790,12 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX89-LABEL: mul_v4half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v4, s4
@@ -851,12 +851,12 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-LABEL: mul_v8half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v4, s6
; NOSDWA-NEXT: v_mov_b32_e32 v5, s7
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; NOSDWA-NEXT: v_mov_b32_e32 v8, s4
@@ -893,12 +893,12 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX89-LABEL: mul_v8half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v4, s0
-; GFX89-NEXT: v_mov_b32_e32 v5, s1
+; GFX89-NEXT: v_mov_b32_e32 v4, s2
+; GFX89-NEXT: v_mov_b32_e32 v5, s3
; GFX89-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX89-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GFX89-NEXT: v_mov_b32_e32 v8, s4
@@ -964,13 +964,13 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina,
; NOSDWA-LABEL: mul_i8:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v2, s7
; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, s6, v0
; NOSDWA-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v4, s1
-; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, s0, v0
+; NOSDWA-NEXT: v_mov_b32_e32 v4, s3
+; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, s2, v0
; NOSDWA-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; NOSDWA-NEXT: flat_load_ubyte v2, v[1:2]
; NOSDWA-NEXT: flat_load_ubyte v3, v[3:4]
@@ -984,13 +984,13 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina,
; GFX89-LABEL: mul_i8:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v2, s7
; GFX89-NEXT: v_add_u32_e32 v1, vcc, s6, v0
; GFX89-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX89-NEXT: v_mov_b32_e32 v4, s1
-; GFX89-NEXT: v_add_u32_e32 v3, vcc, s0, v0
+; GFX89-NEXT: v_mov_b32_e32 v4, s3
+; GFX89-NEXT: v_add_u32_e32 v3, vcc, s2, v0
; GFX89-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX89-NEXT: flat_load_ubyte v2, v[1:2]
; GFX89-NEXT: flat_load_ubyte v3, v[3:4]
@@ -1043,14 +1043,14 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; NOSDWA-LABEL: mul_v2i8:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_ushort v4, v[0:1]
; NOSDWA-NEXT: flat_load_ushort v2, v[2:3]
@@ -1071,14 +1071,14 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; GFX89-LABEL: mul_v2i8:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_ushort v4, v[0:1]
; GFX89-NEXT: flat_load_ushort v2, v[2:3]
@@ -1143,15 +1143,15 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; NOSDWA-LABEL: mul_v4i8:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: flat_load_dword v4, v[0:1]
-; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
@@ -1183,14 +1183,14 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; GFX89-LABEL: mul_v4i8:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
@@ -1272,14 +1272,14 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; NOSDWA-LABEL: mul_v8i8:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1331,14 +1331,14 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; GFX89-LABEL: mul_v8i8:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1449,13 +1449,13 @@ entry:
define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
; NOSDWA-LABEL: sitofp_v2i16_to_v2f16:
; NOSDWA: ; %bb.0: ; %entry
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; NOSDWA-NEXT: v_cvt_f16_i16_e32 v3, v3
@@ -1467,13 +1467,13 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
;
; GFX89-LABEL: sitofp_v2i16_to_v2f16:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
+; GFX89-NEXT: v_mov_b32_e32 v0, s6
+; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: flat_load_dword v2, v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f16_i16_sdwa v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX89-NEXT: v_cvt_f16_i16_e32 v2, v2
@@ -1483,29 +1483,29 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
;
; GFX9-LABEL: sitofp_v2i16_to_v2f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_f16_i16_e32 v2, v1
; GFX9-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sitofp_v2i16_to_v2f16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f16_i16_e32 v2, v1
; GFX10-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) #0 {
@@ -1520,11 +1520,11 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-LABEL: mac_v2half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
@@ -1544,11 +1544,11 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX89-LABEL: mac_v2half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: flat_load_dword v3, v[0:1]
@@ -1605,15 +1605,15 @@ entry:
define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; NOSDWA-LABEL: immediate_mul_v2i16:
; NOSDWA: ; %bb.0: ; %entry
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
-; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
+; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, 0x7b, v2
; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -1625,16 +1625,16 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX89-LABEL: immediate_mul_v2i16:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX89-NEXT: v_mov_b32_e32 v3, 0x141
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
-; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX89-NEXT: v_mov_b32_e32 v1, s7
+; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX89-NEXT: flat_load_dword v2, v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_mul_lo_u16_e32 v4, 0x7b, v2
; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1644,27 +1644,27 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: immediate_mul_v2i16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s0, 0x141007b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
-; GFX9-NEXT: s_mov_b32 s2, 0x141007b
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: immediate_mul_v2i16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0x141007b, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1680,14 +1680,14 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; NOSDWA-LABEL: mulmul_v2i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
@@ -1709,14 +1709,14 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX89-LABEL: mulmul_v2i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
@@ -1778,12 +1778,12 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; NOSDWA-LABEL: add_bb_v2i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: flat_load_dword v1, v[0:1]
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
@@ -1803,12 +1803,12 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX89-LABEL: add_bb_v2i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_dword v1, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
@@ -1863,13 +1863,13 @@ store_label:
define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrspace(1) %destValues) #0 {
; NOSDWA-LABEL: pulled_out_test:
; NOSDWA: ; %bb.0: ; %entry
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s7
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_and_b32_e32 v4, 0xff, v0
; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 8, v0
@@ -1900,15 +1900,15 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
;
; GFX89-LABEL: pulled_out_test:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: v_mov_b32_e32 v4, 8
; GFX89-NEXT: v_mov_b32_e32 v5, 0xff
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v2, s2
-; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_mov_b32_e32 v2, s6
+; GFX89-NEXT: v_mov_b32_e32 v3, s7
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_lshrrev_b32_sdwa v6, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX89-NEXT: v_lshrrev_b32_e32 v7, 24, v0
@@ -1929,12 +1929,12 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
;
; GFX9-LABEL: pulled_out_test:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 8
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0
@@ -1950,18 +1950,18 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: pulled_out_test:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 8
; GFX10-NEXT: v_mov_b32_e32 v4, 24
; GFX10-NEXT: v_mov_b32_e32 v5, 0xff
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_sdwa v6, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v7, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -1975,7 +1975,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
; GFX10-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
entry:
%idxprom = ashr exact i64 15, 32
@@ -2207,11 +2207,11 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr
; NOSDWA-LABEL: mac_v2half_same_srcop:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
@@ -2231,12 +2231,12 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr
; GFX89-LABEL: mac_v2half_same_srcop:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 0992e9e3..53b78bd 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -85,17 +85,17 @@ define amdgpu_kernel void @select_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
-; GFX11-NEXT: s_mov_b32 s14, -1
-; GFX11-NEXT: s_mov_b32 s15, 0x31016000
-; GFX11-NEXT: s_mov_b32 s18, s14
-; GFX11-NEXT: s_mov_b32 s19, s15
-; GFX11-NEXT: s_mov_b32 s22, s14
-; GFX11-NEXT: s_mov_b32 s23, s15
-; GFX11-NEXT: s_mov_b32 s26, s14
-; GFX11-NEXT: s_mov_b32 s27, s15
-; GFX11-NEXT: s_mov_b32 s2, s14
-; GFX11-NEXT: s_mov_b32 s3, s15
+; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
+; GFX11-NEXT: s_mov_b32 s26, s2
+; GFX11-NEXT: s_mov_b32 s27, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s16, s6
; GFX11-NEXT: s_mov_b32 s17, s7
@@ -109,13 +109,13 @@ define amdgpu_kernel void @select_f16(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[24:27], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v3, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s4
-; GFX11-NEXT: s_mov_b32 s13, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[12:15], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -174,64 +174,64 @@ define amdgpu_kernel void @select_f16_imm_a(
;
; VI-LABEL: select_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -287,64 +287,64 @@ define amdgpu_kernel void @select_f16_imm_b(
;
; VI-LABEL: select_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -400,65 +400,65 @@ define amdgpu_kernel void @select_f16_imm_c(
;
; VI-LABEL: select_f16_imm_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_f16_imm_c:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -514,65 +514,65 @@ define amdgpu_kernel void @select_f16_imm_d(
;
; VI-LABEL: select_f16_imm_d:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_f16_imm_d:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -801,28 +801,28 @@ define amdgpu_kernel void @select_v2f16_imm_a(
;
; VI-LABEL: select_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
-; VI-NEXT: s_movk_i32 s2, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; VI-NEXT: s_movk_i32 s6, 0x3900
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
@@ -830,36 +830,36 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3
+; VI-NEXT: v_cmp_lt_f16_e32 vcc, s6, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
@@ -874,7 +874,7 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -942,28 +942,28 @@ define amdgpu_kernel void @select_v2f16_imm_b(
;
; VI-LABEL: select_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
-; VI-NEXT: s_movk_i32 s2, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; VI-NEXT: s_movk_i32 s6, 0x3900
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
@@ -971,36 +971,36 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3
+; VI-NEXT: v_cmp_gt_f16_e32 vcc, s6, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
@@ -1015,7 +1015,7 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1083,29 +1083,29 @@ define amdgpu_kernel void @select_v2f16_imm_c(
;
; VI-LABEL: select_v2f16_imm_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1118,32 +1118,32 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_c:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -1158,7 +1158,7 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1226,29 +1226,29 @@ define amdgpu_kernel void @select_v2f16_imm_d(
;
; VI-LABEL: select_v2f16_imm_d:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1261,32 +1261,32 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_d:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -1301,7 +1301,7 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index b3f4790..232c05e 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -29,17 +29,17 @@ define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: shl_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s5, s5, s7
-; VI-NEXT: s_lshl_b32 s4, s4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b32 s1, s1, s3
+; VI-NEXT: s_lshl_b32 s0, s0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_v2i32:
@@ -159,21 +159,21 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: shl_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i16:
@@ -396,29 +396,29 @@ define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: shl_i16_computed_amount:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_add_u16_e32 v0, 3, v0
; VI-NEXT: v_lshlrev_b16_e32 v0, v0, v2
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i16_computed_amount:
@@ -484,14 +484,14 @@ define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) {
;
; VI-LABEL: shl_i16_i_s:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s4, s4, 12
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_lshl_b32 s0, s2, 12
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i16_i_s:
@@ -561,26 +561,26 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: shl_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e64 v2, v0, s4
+; VI-NEXT: v_lshlrev_b16_e64 v2, v0, s0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_v2i16:
@@ -659,15 +659,15 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: shl_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
@@ -770,16 +770,16 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: shl_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i64:
@@ -1041,14 +1041,14 @@ define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a
;
; VI-LABEL: s_shl_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_32_i64:
@@ -1153,18 +1153,18 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: s_shl_constant_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s9, 0xffff
-; VI-NEXT: s_mov_b32 s8, s6
-; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s8, s2
+; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_lshl_b64 s[0:1], s[8:9], s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_lshl_b64 s[4:5], s[8:9], s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_constant_i64:
@@ -1215,20 +1215,20 @@ define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspa
;
; VI-LABEL: v_shl_constant_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s0, 0xab19b207
-; VI-NEXT: s_movk_i32 s1, 0x11e
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, 0xab19b207
+; VI-NEXT: s_movk_i32 s5, 0x11e
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_shl_constant_i64:
@@ -1285,16 +1285,16 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr
;
; VI-LABEL: v_shl_i64_32_bit_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 0x12d687, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 0x12d687, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_shl_i64_32_bit_constant:
@@ -1349,16 +1349,16 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: v_shl_inline_imm_64_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 64, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 64, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_shl_inline_imm_64_i64:
@@ -1407,15 +1407,15 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: s_shl_inline_imm_64_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 64, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 64, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_64_i64:
@@ -1457,15 +1457,15 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: s_shl_inline_imm_1_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 1, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 1, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_1_i64:
@@ -1508,15 +1508,15 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: s_shl_inline_imm_1_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 1.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 1.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_1_0_i64:
@@ -1555,15 +1555,15 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_neg_1_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], -1.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], -1.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
@@ -1602,15 +1602,15 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: s_shl_inline_imm_0_5_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 0.5, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 0.5, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_0_5_i64:
@@ -1649,15 +1649,15 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_neg_0_5_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], -0.5, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], -0.5, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
@@ -1696,15 +1696,15 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: s_shl_inline_imm_2_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 2.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 2.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_2_0_i64:
@@ -1743,15 +1743,15 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_neg_2_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], -2.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], -2.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
@@ -1790,15 +1790,15 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: s_shl_inline_imm_4_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 4.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 4.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_4_0_i64:
@@ -1837,15 +1837,15 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_neg_4_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], -4.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], -4.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
@@ -1887,15 +1887,15 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_f32_4_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 0x40800000, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 0x40800000, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 1384fb0..05948d8 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -90,26 +90,26 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_shl_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v1, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0
@@ -142,24 +142,24 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX10-LABEL: v_shl_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v1, v0
-; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_shl_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -364,27 +364,27 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_imm_v_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_imm_v_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8
@@ -416,24 +416,24 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX10-LABEL: shl_imm_v_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: shl_imm_v_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -450,26 +450,26 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_v_imm_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_imm_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
@@ -498,24 +498,24 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX10-LABEL: shl_v_imm_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: shl_v_imm_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -532,27 +532,27 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_shl_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
@@ -595,26 +595,26 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX10-LABEL: v_shl_v4i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_shl_v4i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX11-NEXT: v_pk_lshlrev_b16 v0, v2, v0
-; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -633,27 +633,27 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %
define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_v_imm_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_imm_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
@@ -692,26 +692,26 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX10-LABEL: shl_v_imm_v4i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: shl_v_imm_v4i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 1a55bf6..5af7dfe 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -48,15 +48,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i32_x_sub_64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
@@ -65,16 +65,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i32_x_sub_64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -84,35 +84,35 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_test_i32_x_sub_64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_x_sub_64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -173,18 +173,18 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v4, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
; VI-SDAG-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4
@@ -196,19 +196,19 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v4, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
@@ -221,52 +221,52 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v2, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_x_sub_64_multi_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 64, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -319,15 +319,15 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i32_64_sub_x:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u32_e32 v2, vcc, 64, v3
@@ -336,16 +336,16 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i32_64_sub_x:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -355,35 +355,35 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_test_i32_64_sub_x:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 64, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_64_sub_x:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_64_sub_x:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -432,15 +432,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i32_x_sub_65:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3
@@ -449,16 +449,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i32_x_sub_65:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -468,70 +468,70 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-SDAG-LABEL: v_test_i32_x_sub_65:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_i32_x_sub_65:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0x41, v1
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_i32_x_sub_65:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_i32_x_sub_65:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_i32_x_sub_65:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_i32_x_sub_65:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -580,15 +580,15 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i32_65_sub_x:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v3
@@ -597,16 +597,16 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i32_65_sub_x:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -616,35 +616,35 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_test_i32_65_sub_x:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 0x41, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_65_sub_x:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_65_sub_x:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -693,15 +693,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_i32_x_sub_neg16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, 16, v3
@@ -710,16 +710,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_i32_x_sub_neg16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -729,70 +729,70 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
;
; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, -16, v1
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -841,15 +841,15 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_i32_neg16_sub_x:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u32_e32 v2, vcc, -16, v3
@@ -858,16 +858,16 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_i32_neg16_sub_x:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -877,35 +877,35 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_test_i32_neg16_sub_x:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, -16, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_neg16_sub_x:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, -16, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_neg16_sub_x:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, -16, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -954,15 +954,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_i32_x_sub_neg17:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, 17, v3
@@ -971,16 +971,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_i32_x_sub_neg17:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -990,70 +990,70 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
;
; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 17, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0xffffffef, v1
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1102,15 +1102,15 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_i32_neg17_sub_x:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v3
@@ -1119,16 +1119,16 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_i32_neg17_sub_x:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1138,35 +1138,35 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_test_i32_neg17_sub_x:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 0xffffffef, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_neg17_sub_x:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_neg17_sub_x:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1270,15 +1270,15 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i16_x_sub_64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_ushort v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_subrev_u16_e32 v2, 64, v3
@@ -1287,16 +1287,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i16_x_sub_64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_ushort v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1306,35 +1306,35 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_test_i16_x_sub_64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i16_x_sub_64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1387,16 +1387,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
;
; VI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-SDAG-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-SDAG-NEXT: flat_load_ushort v2, v[1:2]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_subrev_u16_e32 v2, 64, v2
@@ -1405,17 +1405,17 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
;
; VI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s7
; VI-GISEL-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; VI-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-GISEL-NEXT: flat_load_ushort v2, v[1:2]
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1425,41 +1425,41 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
;
; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v1, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v1, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1521,18 +1521,18 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_ushort v3, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_ushort v4, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_subrev_u16_e32 v2, 64, v3
; VI-SDAG-NEXT: v_subrev_u16_e32 v3, 64, v4
@@ -1544,19 +1544,19 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_ushort v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_ushort v4, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v3
@@ -1569,52 +1569,52 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v2, s[0:1]
+; GFX9-NEXT: global_store_short v0, v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX10-NEXT: v_sub_nc_u16 v2, v2, 64
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-NEXT: global_store_short v0, v2, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i16_x_sub_64_multi_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX11-NEXT: v_sub_nc_u16 v2, v2, 64
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] dlc
+; GFX11-NEXT: global_store_b16 v0, v2, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1676,16 +1676,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1696,17 +1696,17 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1718,35 +1718,35 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
;
; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_64_64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_64_64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1804,16 +1804,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, -7, v3
@@ -1824,17 +1824,17 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1846,48 +1846,48 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x400007
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x400007
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x400007
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_7_64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x400007
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_7_64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x400007
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1945,16 +1945,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff85
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1965,17 +1965,17 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7b
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1987,48 +1987,48 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x7b0040
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x7b0040
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7b0040
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_64_123:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_64_123:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2084,15 +2084,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
@@ -2103,17 +2103,17 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3
@@ -2125,35 +2125,35 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, 7
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_7_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 7
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_7_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 7
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2208,16 +2208,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, -16
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2227,18 +2227,18 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 16
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2248,35 +2248,35 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_0_16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_0_16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2330,16 +2330,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2349,18 +2349,18 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffc400
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2370,48 +2370,48 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_brev_b32 s0, 35
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_brev_b32 s2, 35
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 35
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2465,16 +2465,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xffffbc00
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2484,18 +2484,18 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4400
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2505,48 +2505,48 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_brev_b32 s0, 34
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_brev_b32 s2, 34
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 34
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x44000000
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x44000000
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2605,16 +2605,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 32
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2625,17 +2625,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffffe0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -2647,35 +2647,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2729,16 +2729,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 32
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2748,18 +2748,18 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffe0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2769,35 +2769,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2852,15 +2852,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
@@ -2871,17 +2871,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3
@@ -2893,35 +2893,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2980,16 +2980,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, -16
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, -16, v3
@@ -3000,17 +3000,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, -16
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3022,35 +3022,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3104,16 +3104,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, -16
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -3123,18 +3123,18 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, -16
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -3144,35 +3144,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3227,15 +3227,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
@@ -3246,17 +3246,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3
@@ -3268,35 +3268,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3354,16 +3354,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc400
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, 0xc400, v3
@@ -3374,17 +3374,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc400
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3396,72 +3396,72 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_movk_i32 s0, 0xc400
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_movk_i32 s2, 0xc400
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc400c400
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xc400c400, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xc400c400, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -3519,16 +3519,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, 0x4400, v3
@@ -3539,17 +3539,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3561,72 +3561,72 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_movk_i32 s0, 0x4400
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_movk_i32 s2, 0x4400
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x44004400
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0x44004400, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0x44004400, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -3684,16 +3684,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, 0x4000, v3
@@ -3704,17 +3704,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3726,35 +3726,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3812,16 +3812,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, 0xc000, v3
@@ -3832,17 +3832,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3854,35 +3854,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3935,15 +3935,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 32
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3953,19 +3953,19 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffe0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -3975,35 +3975,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4054,15 +4054,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_subrev_u16_e32 v2, 32, v3
@@ -4071,19 +4071,19 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0
+; VI-GISEL-NEXT: s_lshl_b32 s0, s0, 16
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: s_and_b32 s2, 0xffff, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: s_lshl_b32 s0, s2, 16
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffe0, v3
@@ -4093,71 +4093,71 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffe0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 1ab6376..3dcdfeb 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -45,12 +45,12 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out,
; FLAT-NEXT: s_cbranch_execnz .LBB0_1
; FLAT-NEXT: ; %bb.2: ; %ENDLOOP
; FLAT-NEXT: s_or_b64 exec, exec, s[2:3]
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
+; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: v_mov_b32_e32 v0, 0
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
main_body:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index 33249e4..1aa3da9 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -130,15 +130,15 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun
;
; VI-LABEL: s_sext_i32_to_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s5, s4, 31
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ashr_i32 s0, s2, 31
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%sext = sext i32 %a to i64
store i64 %sext, ptr addrspace(1) %out, align 8
@@ -166,20 +166,20 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: v_sext_i32_to_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%sext = sext i32 %val to i64
@@ -203,15 +203,15 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun
;
; VI-LABEL: s_sext_i16_to_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%sext = sext i16 %a to i64
store i64 %sext, ptr addrspace(1) %out, align 8
@@ -276,17 +276,17 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s4, s5
-; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: s_cmp_eq_u32 s6, s7
-; VI-NEXT: s_cselect_b64 s[6:7], -1, 0
-; VI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%cmp0 = icmp eq i32 %a, %b
%cmp1 = icmp eq i32 %c, %d
@@ -375,26 +375,26 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s4
-; VI-NEXT: s_ashr_i32 s5, s4, 24
-; VI-NEXT: s_bfe_i32 s6, s4, 0x80010
-; VI-NEXT: s_sext_i32_i8 s4, s4
+; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2
+; VI-NEXT: s_ashr_i32 s0, s2, 24
+; VI-NEXT: s_bfe_i32 s1, s2, 0x80010
+; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%cast = bitcast i32 %a to <4 x i8>
@@ -443,30 +443,30 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT: v_bfe_i32 v3, v0, 16, 8
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: v_bfe_i32 v1, v1, 0, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in
@@ -513,27 +513,27 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a)
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_ashr_i32 s1, s2, 16
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_ashr_i32 s0, s3, 16
-; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_ashr_i32 s5, s6, 16
+; VI-NEXT: s_sext_i32_i16 s6, s6
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s4, s7, 16
+; VI-NEXT: s_sext_i32_i16 s7, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s5
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%cast = bitcast i64 %a to <4 x i16>
@@ -580,29 +580,29 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addr
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT: v_bfe_i32 v1, v1, 0, 16
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %in
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 539cfc7..4770a35 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -42,25 +42,25 @@ define amdgpu_kernel void @test_simple_indirect_call() {
;
; GFX9-LABEL: test_simple_indirect_call:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GFX9-NEXT: s_add_u32 s0, s0, s15
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NEXT: s_mul_i32 s4, s4, s5
+; GFX9-NEXT: s_lshr_b32 s4, s6, 16
+; GFX9-NEXT: s_mul_i32 s4, s4, s7
; GFX9-NEXT: v_mul_lo_u32 v0, s4, v0
-; GFX9-NEXT: s_getpc_b64 s[6:7]
-; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12
-; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s7
-; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, indirect@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, indirect@rel32@hi+12
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NEXT: v_mad_u32_u24 v0, v1, s7, v0
; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: ds_write_b64 v0, v[3:4]
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
%fptr = alloca ptr, addrspace(5)
%fptr.cast = addrspacecast ptr addrspace(5) %fptr to ptr
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index b037268..ba0f4c8 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -32,50 +32,50 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
;
; GFX8-LABEL: s_sint_to_fp_i64_to_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s5, s2, s3
-; GFX8-NEXT: s_flbit_i32 s4, s3
-; GFX8-NEXT: s_ashr_i32 s5, s5, 31
-; GFX8-NEXT: s_add_i32 s4, s4, -1
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s4, s4, s5
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX8-NEXT: s_xor_b32 s1, s6, s7
+; GFX8-NEXT: s_flbit_i32 s0, s7
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s2, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_ldexp_f32 v0, v0, s0
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: s_sint_to_fp_i64_to_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s4, s2, s3
-; GFX11-NEXT: s_cls_i32 s5, s3
-; GFX11-NEXT: s_ashr_i32 s4, s4, 31
-; GFX11-NEXT: s_add_i32 s5, s5, -1
-; GFX11-NEXT: s_add_i32 s4, s4, 32
+; GFX11-NEXT: s_xor_b32 s0, s6, s7
+; GFX11-NEXT: s_cls_i32 s1, s7
+; GFX11-NEXT: s_ashr_i32 s0, s0, 31
+; GFX11-NEXT: s_add_i32 s1, s1, -1
+; GFX11-NEXT: s_add_i32 s0, s0, 32
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s4, s5, s4
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_min_u32 s2, s1, s0
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_min_u32 s0, s0, 1
+; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX11-NEXT: s_sub_i32 s0, 32, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -116,12 +116,12 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_sint_to_fp_i64_to_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -136,8 +136,8 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_min_u32_e32 v1, 1, v1
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_ldexp_f32 v1, v1, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
@@ -146,11 +146,11 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: v_sint_to_fp_i64_to_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2
; GFX11-NEXT: v_cls_i32_e32 v4, v2
@@ -170,7 +170,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -209,47 +209,47 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i
;
; GFX8-LABEL: s_sint_to_fp_i64_to_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s5, s2, s3
-; GFX8-NEXT: s_flbit_i32 s4, s3
-; GFX8-NEXT: s_ashr_i32 s5, s5, 31
-; GFX8-NEXT: s_add_i32 s4, s4, -1
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s4, s4, s5
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_sub_i32 s0, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_xor_b32 s1, s6, s7
+; GFX8-NEXT: s_flbit_i32 s0, s7
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s2, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_ldexp_f32 v2, v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: s_sint_to_fp_i64_to_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s4, s2, s3
-; GFX11-NEXT: s_cls_i32 s5, s3
-; GFX11-NEXT: s_ashr_i32 s4, s4, 31
-; GFX11-NEXT: s_add_i32 s5, s5, -1
-; GFX11-NEXT: s_add_i32 s4, s4, 32
+; GFX11-NEXT: s_xor_b32 s0, s6, s7
+; GFX11-NEXT: s_cls_i32 s1, s7
+; GFX11-NEXT: s_ashr_i32 s0, s0, 31
+; GFX11-NEXT: s_add_i32 s1, s1, -1
+; GFX11-NEXT: s_add_i32 s0, s0, 32
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s4, s5, s4
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_min_u32 s2, s1, s0
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_min_u32 s0, s0, 1
+; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX11-NEXT: s_sub_i32 s0, 32, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -289,12 +289,12 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_sint_to_fp_i64_to_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -305,11 +305,11 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_min_u32_e32 v4, v4, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v3
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; GFX8-NEXT: v_ldexp_f32 v2, v5, v2
@@ -318,11 +318,11 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: v_sint_to_fp_i64_to_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2
; GFX11-NEXT: v_cls_i32_e32 v4, v2
@@ -341,7 +341,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -392,34 +392,34 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s3, s6, s7
-; GFX8-NEXT: s_flbit_i32 s2, s7
-; GFX8-NEXT: s_ashr_i32 s3, s3, 31
-; GFX8-NEXT: s_add_i32 s2, s2, -1
-; GFX8-NEXT: s_add_i32 s3, s3, 32
-; GFX8-NEXT: s_min_u32 s9, s2, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX8-NEXT: s_xor_b32 s2, s4, s5
+; GFX8-NEXT: s_xor_b32 s1, s6, s7
+; GFX8-NEXT: s_flbit_i32 s0, s7
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s9, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s9
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX8-NEXT: s_xor_b32 s0, s4, s5
; GFX8-NEXT: s_flbit_i32 s8, s5
-; GFX8-NEXT: s_ashr_i32 s2, s2, 31
+; GFX8-NEXT: s_ashr_i32 s0, s0, 31
; GFX8-NEXT: s_add_i32 s8, s8, -1
-; GFX8-NEXT: s_add_i32 s2, s2, 32
-; GFX8-NEXT: s_min_u32 s6, s8, s2
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s9
-; GFX8-NEXT: v_ldexp_f32 v1, v0, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s6
-; GFX8-NEXT: v_ldexp_f32 v0, v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_i32 s0, s0, 32
+; GFX8-NEXT: s_min_u32 s6, s8, s0
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], s6
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s9
+; GFX8-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s6
+; GFX8-NEXT: v_ldexp_f32 v0, v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -427,35 +427,35 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s3, s6, s7
+; GFX11-NEXT: s_xor_b32 s1, s6, s7
; GFX11-NEXT: s_xor_b32 s9, s4, s5
-; GFX11-NEXT: s_cls_i32 s2, s7
+; GFX11-NEXT: s_cls_i32 s0, s7
; GFX11-NEXT: s_cls_i32 s8, s5
-; GFX11-NEXT: s_ashr_i32 s3, s3, 31
+; GFX11-NEXT: s_ashr_i32 s1, s1, 31
; GFX11-NEXT: s_ashr_i32 s9, s9, 31
-; GFX11-NEXT: s_add_i32 s2, s2, -1
+; GFX11-NEXT: s_add_i32 s0, s0, -1
; GFX11-NEXT: s_add_i32 s8, s8, -1
-; GFX11-NEXT: s_add_i32 s3, s3, 32
+; GFX11-NEXT: s_add_i32 s1, s1, 32
; GFX11-NEXT: s_add_i32 s9, s9, 32
-; GFX11-NEXT: s_min_u32 s10, s2, s3
+; GFX11-NEXT: s_min_u32 s10, s0, s1
; GFX11-NEXT: s_min_u32 s8, s8, s9
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s10
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s10
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
-; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s0, s0, 1
; GFX11-NEXT: s_min_u32 s4, s4, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s3, s5, s4
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s10
-; GFX11-NEXT: s_sub_i32 s3, 32, s8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s5, s4
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s1
+; GFX11-NEXT: s_sub_i32 s0, 32, s10
+; GFX11-NEXT: s_sub_i32 s1, 32, s8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ldexp_f32 v1, v0, s2
-; GFX11-NEXT: v_ldexp_f32 v0, v2, s3
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX11-NEXT: v_ldexp_f32 v0, v2, s1
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -534,19 +534,19 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4
@@ -603,12 +603,12 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
;
; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4
; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2
@@ -664,7 +664,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
; GFX11-NEXT: v_ldexp_f32 v2, v1, v10
; GFX11-NEXT: v_ldexp_f32 v1, v6, v11
; GFX11-NEXT: v_ldexp_f32 v0, v4, v5
-; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v7, v[0:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -719,37 +719,37 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s3, s6, s7
-; GFX8-NEXT: s_flbit_i32 s2, s7
-; GFX8-NEXT: s_ashr_i32 s3, s3, 31
-; GFX8-NEXT: s_add_i32 s2, s2, -1
-; GFX8-NEXT: s_add_i32 s3, s3, 32
-; GFX8-NEXT: s_min_u32 s8, s2, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: s_xor_b32 s3, s4, s5
-; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX8-NEXT: s_flbit_i32 s2, s5
-; GFX8-NEXT: s_ashr_i32 s3, s3, 31
-; GFX8-NEXT: s_add_i32 s2, s2, -1
-; GFX8-NEXT: s_add_i32 s3, s3, 32
-; GFX8-NEXT: s_min_u32 s7, s2, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s7
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s2
+; GFX8-NEXT: s_xor_b32 s1, s6, s7
+; GFX8-NEXT: s_flbit_i32 s0, s7
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s8, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: s_xor_b32 s1, s4, s5
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX8-NEXT: s_flbit_i32 s0, s5
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s7, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], s7
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s0
; GFX8-NEXT: s_sub_i32 s6, 32, s8
-; GFX8-NEXT: s_sub_i32 s2, 32, s7
+; GFX8-NEXT: s_sub_i32 s0, 32, s7
; GFX8-NEXT: v_ldexp_f32 v0, v0, s6
-; GFX8-NEXT: v_ldexp_f32 v1, v1, s2
+; GFX8-NEXT: v_ldexp_f32 v1, v1, s0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_or_b32_e32 v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -757,40 +757,40 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s3, s6, s7
+; GFX11-NEXT: s_xor_b32 s1, s6, s7
; GFX11-NEXT: s_xor_b32 s9, s4, s5
-; GFX11-NEXT: s_cls_i32 s2, s7
+; GFX11-NEXT: s_cls_i32 s0, s7
; GFX11-NEXT: s_cls_i32 s8, s5
-; GFX11-NEXT: s_ashr_i32 s3, s3, 31
+; GFX11-NEXT: s_ashr_i32 s1, s1, 31
; GFX11-NEXT: s_ashr_i32 s9, s9, 31
-; GFX11-NEXT: s_add_i32 s2, s2, -1
+; GFX11-NEXT: s_add_i32 s0, s0, -1
; GFX11-NEXT: s_add_i32 s8, s8, -1
-; GFX11-NEXT: s_add_i32 s3, s3, 32
+; GFX11-NEXT: s_add_i32 s1, s1, 32
; GFX11-NEXT: s_add_i32 s9, s9, 32
-; GFX11-NEXT: s_min_u32 s10, s2, s3
+; GFX11-NEXT: s_min_u32 s10, s0, s1
; GFX11-NEXT: s_min_u32 s8, s8, s9
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s10
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s10
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
-; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s0, s0, 1
; GFX11-NEXT: s_min_u32 s4, s4, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s3, s5, s4
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_i32_e32 v1, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s10
-; GFX11-NEXT: s_sub_i32 s3, 32, s8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s5, s4
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_i32_e32 v1, s1
+; GFX11-NEXT: s_sub_i32 s0, 32, s10
+; GFX11-NEXT: s_sub_i32 s1, 32, s8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT: v_ldexp_f32 v1, v1, s3
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
+; GFX11-NEXT: v_ldexp_f32 v1, v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -877,18 +877,18 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
-; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4
; GFX8-NEXT: v_xor_b32_e32 v12, v1, v2
@@ -943,7 +943,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v9
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc
; GFX8-NEXT: v_or_b32_e32 v2, v4, v3
; GFX8-NEXT: v_or_b32_e32 v3, v6, v5
@@ -952,12 +952,12 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
;
; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4
; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2
@@ -1022,7 +1022,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index b4b0d96..8384576 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @sitofp_i16_to_f16(
;
; VI-LABEL: sitofp_i16_to_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_i16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sitofp_i16_to_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @sitofp_i32_to_f16(
;
; VI-LABEL: sitofp_i32_to_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sitofp_i32_to_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -168,44 +168,44 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
;
; VI-LABEL: sitofp_v2i16_to_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_i16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f16_i16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sitofp_v2i16_to_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_i16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -244,39 +244,39 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
;
; VI-LABEL: sitofp_v2i32_to_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_i32_e32 v1, v1
; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sitofp_v2i32_to_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -285,7 +285,7 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -357,20 +357,19 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -378,9 +377,10 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index e1bd152..e585e6c 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -4988,1318 +4988,1317 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-LABEL: test:
; GFX9-FLATSCR: ; %bb.0: ; %entry
; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
-; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x80
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x80
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, s2, v5
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, s6, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s7
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v0, vcc
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 4
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 20
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 20
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 36
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 36
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 52
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 52
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x44
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x54
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x64
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x74
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x100
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x100
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x84
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x94
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x180
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x180
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x104
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x104
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x114
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x114
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x124
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x124
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x134
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x134
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x144
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x144
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x154
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x154
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x164
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x164
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x174
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x174
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x200
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x200
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x184
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x184
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x194
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x194
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1a4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1b4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1c4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1d4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1e4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1f4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x280
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x280
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x204
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x204
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x214
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x214
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x224
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x224
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x234
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x234
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x244
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x244
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x254
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x254
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x264
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x264
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x274
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x274
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x300
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x300
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x284
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x284
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x294
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x294
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2a4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2b4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2c4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2d4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2e4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2f4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x380
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x380
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x304
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x304
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x314
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x314
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x324
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x324
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x334
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x334
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x344
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x344
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x354
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x354
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x364
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x364
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x374
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x374
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x400
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x400
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x384
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x384
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x394
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x394
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3a4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3b4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3c4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3d4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3e4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3]
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x404
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, s1
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7]
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x404
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, s5
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:16
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x414
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:16
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x414
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:32
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x424
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:32
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x424
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:48
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x434
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:48
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x434
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:64
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x444
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:64
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x444
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:80
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x454
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:80
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x454
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:96
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x464
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:96
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x464
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:112
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x474
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:112
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x474
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:128
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x484
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:128
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x484
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:144
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x494
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:144
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x494
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:160
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:160
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:176
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:176
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:192
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:192
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:208
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:208
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:224
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:224
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:240
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:240
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:256
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x504
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:256
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x504
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:272
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x514
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:272
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x514
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:288
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x524
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:288
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x524
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:304
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x534
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:304
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x534
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:320
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x544
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:320
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x544
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:336
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x554
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:336
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x554
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:352
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x564
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:352
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x564
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:368
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x574
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:368
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x574
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:384
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x584
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:384
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x584
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:400
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x594
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:400
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x594
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:416
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:416
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:432
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:432
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:448
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:448
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:464
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:464
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:480
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:480
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:496
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:496
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:512
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x604
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:512
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x604
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:528
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x614
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:528
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x614
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:544
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x624
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:544
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x624
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:560
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x634
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:560
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x634
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:576
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x644
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:576
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x644
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:592
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x654
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:592
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x654
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:608
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x664
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:608
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x664
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:624
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x674
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:624
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x674
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:640
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x684
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:640
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x684
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:656
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x694
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:656
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x694
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:672
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:672
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:688
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:688
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:704
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:704
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:720
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:720
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:736
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:736
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:752
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:752
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:768
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x704
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:768
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x704
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:784
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x714
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:784
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x714
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:800
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x724
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:800
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x724
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:816
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x734
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:816
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x734
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:832
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x744
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:832
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x744
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:848
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x754
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:848
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x754
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:864
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x764
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:864
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x764
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:880
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x774
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:880
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x774
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:896
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x784
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:896
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x784
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:912
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x794
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:912
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x794
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:928
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:928
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:944
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:944
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:960
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:960
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:976
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:976
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:992
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:992
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1008
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1008
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1024
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x804
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1024
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x804
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1040
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x814
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1040
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x814
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1056
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x824
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1056
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x824
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1072
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x834
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1072
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x834
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1088
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x844
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1088
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x844
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1104
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x854
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1104
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x854
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1120
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x864
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1120
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x864
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1136
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x874
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1136
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x874
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1152
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x884
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1152
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x884
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1168
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x894
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1168
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x894
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1184
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1184
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1200
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1200
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1216
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1216
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1232
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1232
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1248
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1248
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1264
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1264
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1280
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x904
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1280
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x904
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1296
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x914
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1296
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x914
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1312
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x924
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1312
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x924
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1328
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x934
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1328
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x934
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1344
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x944
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1344
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x944
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1360
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x954
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1360
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x954
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1376
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x964
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1376
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x964
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1392
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x974
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1392
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x974
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1408
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x984
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1408
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x984
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1424
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x994
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1424
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x994
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1440
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1440
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1456
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1456
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1472
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1472
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1488
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1488
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1504
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1504
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1520
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1520
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1536
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1536
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1552
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1552
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1568
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1568
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1584
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1584
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1600
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1600
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1616
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1616
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1632
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1632
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1648
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1648
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1664
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1664
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1680
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1680
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1696
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xaa4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1696
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xaa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1712
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xab4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1712
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xab4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1728
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xac4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1728
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xac4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1744
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xad4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1744
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xad4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1760
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xae4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1760
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xae4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1776
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xaf4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1776
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xaf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1792
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1792
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1808
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1808
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1824
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1824
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1840
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1840
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1856
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1856
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1872
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1872
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1888
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1888
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1904
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1904
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1920
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1920
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1936
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1936
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1952
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xba4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1952
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xba4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1968
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbc4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1984
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbd4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2000
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbe4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2016
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbf4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2032
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2048
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2064
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2080
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2096
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2096
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2112
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2112
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2128
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2128
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2144
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2144
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2160
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2160
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2176
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2176
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2192
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2192
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2208
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xca4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2208
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xca4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2224
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2224
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2240
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcc4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2240
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2256
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcd4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2256
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2272
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xce4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2272
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xce4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2288
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcf4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2288
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2304
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2304
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2320
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2320
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2336
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2336
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2352
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2352
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2368
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2368
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2384
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2384
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2400
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2400
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2416
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2416
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2432
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2432
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2448
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2448
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2464
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xda4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2464
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xda4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2480
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2480
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2496
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdc4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2496
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2512
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdd4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2512
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2528
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xde4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2528
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xde4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2544
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdf4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2544
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2560
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2560
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2576
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2576
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2592
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2592
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2608
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2608
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2624
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2624
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2640
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2640
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2656
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2656
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2672
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2672
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2688
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2688
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2704
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2704
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2720
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xea4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2720
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xea4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2736
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xeb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2736
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xeb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2752
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xec4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2752
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xec4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2768
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xed4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2768
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xed4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2784
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xee4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2784
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xee4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2800
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xef4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2800
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xef4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2816
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2816
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2832
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2832
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2848
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2848
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2864
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2864
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2880
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2880
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2896
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2896
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2912
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2912
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2928
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2928
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2944
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2944
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2960
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2960
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2976
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfa4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2976
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2992
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2992
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3008
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfc4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3008
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3024
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfd4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3024
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3040
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfe4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3040
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3056
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xff4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3056
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3072
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1004
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3072
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1004
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3088
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1014
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3088
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1014
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3104
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1024
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3104
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1024
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3120
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1034
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3120
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1034
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3136
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1044
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3136
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1044
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3152
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1054
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3152
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1054
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3168
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1064
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3168
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1064
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3184
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1074
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3184
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1074
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3200
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1084
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3200
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1084
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3216
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1094
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3216
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1094
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3232
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3232
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3248
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3248
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3264
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3264
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3280
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3280
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3296
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3296
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3312
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3312
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3328
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1104
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3328
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1104
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3344
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1114
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3344
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1114
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3360
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1124
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3360
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1124
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3376
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1134
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3376
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1134
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3392
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1144
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3392
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1144
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3408
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1154
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3408
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1154
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3424
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1164
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3424
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1164
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3440
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1174
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3440
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1174
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3456
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1184
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3456
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1184
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3472
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1194
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3472
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1194
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3488
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3488
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3504
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3504
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3520
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3520
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3536
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3536
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3552
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3552
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3568
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3568
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3584
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1204
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3584
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1204
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3600
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1214
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3600
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1214
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3616
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1224
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3616
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1224
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3632
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1234
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3632
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1234
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3648
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1244
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3648
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1244
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3664
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1254
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3664
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1254
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3680
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1264
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3680
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1264
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3696
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1274
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3696
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1274
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3712
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1284
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3712
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1284
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3728
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1294
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3728
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1294
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3744
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3744
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3760
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3760
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3776
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3776
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3792
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3792
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3808
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3808
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3824
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3824
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3840
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3840
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1304
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3856
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1314
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3856
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1314
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3872
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1324
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3872
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1324
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3888
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1334
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3888
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1334
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3904
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1344
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3904
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1344
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3920
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1354
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3920
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1354
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3936
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1364
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3936
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1364
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3952
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1374
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3952
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1374
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1384
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3968
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1384
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1394
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3984
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1394
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4000
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4016
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4032
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4048
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4064
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4080
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ;;#ASMEND
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
@@ -6314,1035 +6313,1036 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-NEXT: ;;#ASMEND
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ;;#ASMEND
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, s0, v5
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, s4, v5
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4080
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13d4
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4064
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4080
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4048
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4064
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4032
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4048
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4016
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1394
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4032
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4000
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1384
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4016
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1394
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3984
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1374
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4000
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1384
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3968
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1364
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3984
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1374
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3952
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1354
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3968
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1364
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3936
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1344
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3952
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1354
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3920
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1334
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3936
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1344
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3904
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1324
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3920
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1334
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3888
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1314
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3904
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1324
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3872
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1304
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3888
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1314
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3856
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3872
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1304
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3840
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3856
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3824
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3840
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3808
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3824
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3792
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3808
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3776
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3792
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3760
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1294
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3776
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3744
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1284
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3760
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1294
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3728
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1274
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3744
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1284
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3712
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1264
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3728
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1274
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3696
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1254
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3712
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1264
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3680
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1244
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3696
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1254
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3664
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1234
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3680
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1244
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3648
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1224
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3664
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1234
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3632
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1214
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3648
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1224
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3616
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1204
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3632
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1214
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3600
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3616
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1204
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3584
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3600
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3568
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3584
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3552
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3568
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3536
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3552
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3520
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3536
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3504
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1194
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3520
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3488
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1184
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3504
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1194
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3472
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1174
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3488
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1184
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3456
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1164
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3472
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1174
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3440
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1154
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3456
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1164
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3424
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1144
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3440
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1154
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3408
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1134
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3424
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1144
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3392
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1124
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3408
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1134
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3376
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1114
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3392
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1124
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3360
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1104
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3376
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1114
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3344
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3360
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1104
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3328
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3344
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3312
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3328
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3296
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3312
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3280
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3296
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3264
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3280
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3248
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1094
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3264
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3232
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1084
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3248
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1094
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3216
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1074
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3232
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1084
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3200
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1064
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3216
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1074
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3184
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1054
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3200
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1064
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3168
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1044
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3184
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1054
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3152
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1034
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3168
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1044
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3136
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1024
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3152
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1034
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3120
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1014
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3136
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1024
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3104
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1004
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3120
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1014
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3088
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xff4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3104
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1004
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3072
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfe4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3088
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3056
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfd4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3072
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3040
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfc4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3056
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3024
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3040
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3008
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfa4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3024
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2992
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3008
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2976
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2992
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2960
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2976
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2944
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2960
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2928
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2944
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2912
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2928
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2896
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2912
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2880
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2896
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2864
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2880
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2848
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2864
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2832
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xef4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2848
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2816
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xee4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2832
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xef4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2800
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xed4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2816
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xee4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2784
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xec4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2800
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xed4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2768
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xeb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2784
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xec4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2752
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xea4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2768
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xeb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2736
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2752
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xea4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2720
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2736
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2704
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2720
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2688
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2704
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2672
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2688
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2656
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2672
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2640
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2656
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2624
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2640
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2608
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2624
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2592
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2608
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2576
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xdf4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2592
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2560
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xde4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2576
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2544
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xdd4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2560
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xde4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2528
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xdc4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2544
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2512
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xdb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2528
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2496
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xda4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2512
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2480
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2496
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xda4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2464
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2480
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2448
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2464
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2432
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2448
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2416
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2432
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2400
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2416
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2384
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2400
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2368
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2384
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2352
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2368
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2336
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2352
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2320
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xcf4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2336
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2304
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xce4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2320
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2288
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xcd4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2304
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xce4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2272
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xcc4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2288
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2256
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xcb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2272
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2240
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xca4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2256
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2224
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2240
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xca4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2208
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2224
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2192
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2208
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2176
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2192
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2160
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2176
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2144
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2160
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2128
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2144
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2112
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2128
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2096
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2112
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2080
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2096
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2064
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbf4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2080
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2048
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbe4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2064
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2032
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbd4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2048
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2016
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbc4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2032
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2000
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2016
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1984
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xba4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2000
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1968
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1984
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xba4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1952
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1968
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1936
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1952
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1920
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1936
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1904
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1920
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1888
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1904
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1872
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1888
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1856
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1872
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1840
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1856
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1824
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1840
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1808
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xaf4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1824
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1792
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xae4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1808
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xaf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1776
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xad4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1792
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xae4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1760
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xac4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1776
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xad4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1744
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xab4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1760
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xac4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1728
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xaa4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1744
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xab4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1712
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1728
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xaa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1696
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1712
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1680
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1696
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1664
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1680
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1648
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1664
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1632
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1648
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1616
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1632
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1600
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1616
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1584
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1600
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1568
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1584
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1552
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1568
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1536
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1552
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1520
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1536
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1504
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1520
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1488
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1504
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1472
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1488
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1456
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x994
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1472
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1440
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x984
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1456
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x994
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1424
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x974
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1440
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x984
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1408
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x964
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1424
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x974
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1392
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x954
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1408
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x964
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1376
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x944
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1392
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x954
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1360
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x934
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1376
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x944
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1344
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x924
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1360
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x934
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1328
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x914
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1344
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x924
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1312
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x904
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1328
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x914
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1296
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1312
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x904
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1280
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1296
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1264
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1280
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1248
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1264
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1232
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1248
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1216
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1232
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1200
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x894
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1216
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1184
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x884
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1200
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x894
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1168
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x874
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1184
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x884
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1152
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x864
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1168
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x874
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1136
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x854
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1152
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x864
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1120
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x844
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1136
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x854
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1104
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x834
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1120
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x844
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1088
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x824
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1104
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x834
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1072
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x814
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1088
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x824
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1056
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x804
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1072
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x814
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1040
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1056
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x804
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1024
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1040
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1008
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1024
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:992
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1008
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:976
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:992
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:960
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:976
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:944
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x794
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:960
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:928
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x784
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:944
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x794
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:912
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x774
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:928
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x784
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:896
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x764
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:912
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x774
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:880
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x754
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:896
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x764
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:864
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x744
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:880
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x754
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:848
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x734
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:864
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x744
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:832
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x724
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:848
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x734
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:816
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x714
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:832
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x724
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:800
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x704
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:816
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x714
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:784
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:800
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x704
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:768
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:784
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:752
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:768
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:736
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:752
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:720
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:736
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:704
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:720
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:688
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x694
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:672
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x684
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:688
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x694
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:656
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x674
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:672
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x684
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:640
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x664
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:656
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x674
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:624
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x654
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:640
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x664
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:608
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x644
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:624
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x654
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:592
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x634
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:608
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x644
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:576
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x624
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:592
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x634
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:560
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x614
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:576
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x624
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:544
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x604
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:560
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x614
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:528
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:544
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x604
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:512
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:528
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:496
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:512
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:480
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:496
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:464
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:480
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:448
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:464
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:432
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x594
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:448
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:416
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x584
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:432
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x594
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:400
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x574
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:416
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x584
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:384
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x564
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:400
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x574
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:368
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x554
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:384
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x564
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:352
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x544
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:368
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x554
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:336
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x534
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:352
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x544
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:320
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x524
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:336
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x534
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:304
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x514
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:320
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x524
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:288
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x504
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:304
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x514
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:272
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:288
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x504
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:256
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:272
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:240
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:256
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:224
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:240
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:208
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:224
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:192
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:208
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:176
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x494
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:192
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:160
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x484
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:176
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x494
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:144
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x474
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:160
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x484
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:128
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x464
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:144
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x474
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:112
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x454
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:128
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x464
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:96
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x444
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:112
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x454
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:80
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x434
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:96
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x444
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:64
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x424
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:80
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x434
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:48
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x414
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:64
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x424
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:32
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x404
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:48
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x414
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:16
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:32
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x404
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:16
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3f4
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3e4
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
+; GFX9-FLATSCR-NEXT: s_nop 0
; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x400, v4
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3e4
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:4080
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3d4
@@ -7617,14 +7617,14 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_addc_u32 s3, s3, 0
; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0
; GFX10-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, s4, s2, v5
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, s3, 0, s4
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x804
+; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, s0, s6, v5
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, s7, 0, s0
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x804
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x80, v4
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v4
@@ -8045,795 +8045,795 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:2036 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x814
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x814
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x824
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x824
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x834
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x834
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x844
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x844
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x854
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x854
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x864
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x864
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x874
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x874
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v6
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v7, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x884
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x884
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x894
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x894
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8a4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8b4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8c4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8d4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8e4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v8
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x904
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x904
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x914
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x914
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x924
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x924
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x934
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x934
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x944
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x944
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x954
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x954
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x964
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x964
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x974
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x974
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v10
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v11, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x984
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x984
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x994
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x994
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9a4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9b4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9c4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9d4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9e4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v12
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v13, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa04
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa04
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa14
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa24
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa34
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa44
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa54
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa64
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v14
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa84
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa84
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa94
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xaa4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xaa4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xab4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xab4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xac4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xac4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xad4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xad4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xae4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xae4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xaf4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xaf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v16
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v17, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb04
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb04
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb14
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb24
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb34
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb44
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb54
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb64
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v18
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v19, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb84
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb84
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb94
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xba4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xba4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbb4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbc4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbd4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbe4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbf4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3]
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc04
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7]
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:16
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc14
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:16
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:32
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc24
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:32
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:48
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc34
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:48
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:64
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc44
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:64
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:80
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc54
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:80
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:96
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc64
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:96
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:112
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:112
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:128
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc84
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:128
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:144
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc94
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:144
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:160
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xca4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:160
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xca4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:176
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xcb4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:176
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:192
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xcc4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:192
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:208
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xcd4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:208
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:224
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xce4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:224
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xce4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:240
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xcf4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:240
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:256
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd04
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:256
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:272
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd14
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:272
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:288
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd24
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:288
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:304
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd34
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:304
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:320
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd44
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:320
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:336
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd54
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:336
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:352
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd64
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:352
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:368
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:368
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:384
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd84
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:384
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:400
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd94
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:400
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:416
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xda4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:416
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xda4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:432
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xdb4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:432
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:448
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xdc4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:448
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:464
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xdd4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:464
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:480
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xde4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:480
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xde4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:496
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xdf4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:496
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:512
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe04
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:512
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:528
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe14
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:528
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:544
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe24
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:544
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:560
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe34
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:560
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:576
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe44
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:576
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:592
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe54
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:592
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:608
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe64
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:608
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:624
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:624
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:640
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe84
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:640
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:656
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe94
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:656
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:672
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xea4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:672
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xea4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:688
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xeb4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:688
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xeb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:704
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xec4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:704
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xec4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:720
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xed4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:720
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xed4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:736
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xee4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:736
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xee4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:752
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xef4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:752
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xef4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:768
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf04
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:768
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:784
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf14
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:784
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:800
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf24
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:800
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:816
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf34
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:816
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:832
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf44
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:832
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:848
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf54
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:848
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:864
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf64
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:864
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:880
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:880
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:896
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf84
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:896
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:912
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf94
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:912
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:928
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfa4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:928
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfa4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:944
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfb4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:944
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:960
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfc4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:960
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:976
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfd4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:976
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:992
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfe4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:992
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfe4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1008
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xff4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1008
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1024
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1004
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1024
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1004
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1040
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1014
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1040
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1014
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1056
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1024
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1056
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1024
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1072
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1034
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1072
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1034
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1088
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1044
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1088
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1044
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1104
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1054
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1104
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1054
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1120
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1064
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1120
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1064
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1136
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1074
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1136
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1074
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1152
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1084
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1152
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1084
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1168
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1094
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1168
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1094
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1184
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10a4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1184
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1200
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10b4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1200
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1216
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10c4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1216
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1232
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10d4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1232
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1248
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10e4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1248
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1264
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1264
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1280
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1104
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1280
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1104
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1296
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1114
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1296
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1114
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1312
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1124
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1312
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1124
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1328
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1134
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1328
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1134
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1344
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1144
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1344
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1144
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1360
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1154
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1360
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1154
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1376
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1164
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1376
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1164
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1392
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1174
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1392
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1174
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1408
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1184
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1408
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1184
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1424
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1194
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1424
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1194
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1440
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11a4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1440
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1456
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11b4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1456
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1472
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11c4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1472
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1488
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11d4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1488
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1504
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11e4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1504
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1520
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1520
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1536
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1204
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1536
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1204
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1552
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1214
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1552
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1214
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1568
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1568
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1224
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1584
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1234
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1584
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1234
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1600
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1244
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1600
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1244
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1616
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1254
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1616
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1254
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1632
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1264
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1632
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1264
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1648
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1274
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1648
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1274
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1664
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1284
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1664
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1284
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1680
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1294
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1680
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1294
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1696
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12a4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1696
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1712
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12b4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1712
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1728
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12c4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1728
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1744
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12d4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1744
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1760
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12e4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1760
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1776
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1776
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1792
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1792
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1304
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1808
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1314
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1808
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1314
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1824
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1324
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1824
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1324
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1840
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1334
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1840
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1334
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1856
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1344
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1856
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1344
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1872
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1354
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1872
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1354
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1888
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1364
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1888
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1364
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1904
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1374
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1904
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1374
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1920
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1384
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1920
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1384
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1394
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1936
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1394
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13a4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1952
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13b4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1968
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13c4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1984
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13d4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2000
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2016
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13e4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2016
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2032
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2032
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
; GFX10-FLATSCR-NEXT: ;;#ASMEND
-; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, s2, s0, v5
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s2
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13e4
+; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, s0, s4, v5
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v6, null, s5, 0, s0
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13e4
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
; GFX10-FLATSCR-NEXT: ;;#ASMEND
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
@@ -8847,520 +8847,520 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
; GFX10-FLATSCR-NEXT: ;;#ASMEND
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2032
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13d4
-; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2016
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13c4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2032
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2000
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13b4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2016
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1984
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13a4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2000
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1968
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1394
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1984
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1952
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1384
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1968
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1394
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1936
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1374
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1952
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1384
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1920
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1364
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1936
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1374
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1904
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1354
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1920
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1364
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1888
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1344
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1904
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1354
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1872
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1334
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1888
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1344
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1856
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1324
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1872
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1334
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1840
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1314
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1856
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1324
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1824
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1304
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1840
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1314
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1808
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12f4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1824
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1304
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1792
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12e4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1808
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1776
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12d4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1792
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1760
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12c4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1776
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1744
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12b4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1760
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1728
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12a4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1744
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1712
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1294
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1728
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1696
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1284
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1712
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1294
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1680
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1274
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1696
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1284
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1664
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1264
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1680
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1274
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1648
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1254
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1664
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1264
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1632
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1244
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1648
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1254
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1616
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1234
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1632
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1244
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1600
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1224
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1616
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1234
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1584
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1214
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1600
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1224
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1568
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1204
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1584
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1214
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1552
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11f4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1568
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1204
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1536
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11e4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1552
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1520
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11d4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1536
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1504
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11c4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1520
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1488
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11b4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1504
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1472
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11a4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1488
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1456
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1194
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1472
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1440
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1184
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1456
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1194
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1424
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1174
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1440
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1184
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1408
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1164
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1424
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1174
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1392
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1154
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1408
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1164
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1376
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1144
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1392
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1154
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1360
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1134
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1376
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1144
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1344
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1124
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1360
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1134
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1328
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1114
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1344
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1124
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1312
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1104
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1328
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1114
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1296
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10f4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1312
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1104
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1280
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10e4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1296
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1264
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10d4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1280
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1248
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10c4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1264
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1232
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10b4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1248
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1216
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10a4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1232
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1200
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1094
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1216
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1184
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1084
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1200
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1094
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1168
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1074
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1184
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1084
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1152
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1064
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1168
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1074
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1136
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1054
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1152
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1064
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1120
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1044
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1136
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1054
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1104
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1034
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1120
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1044
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1088
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1024
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1104
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1034
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1072
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1014
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1088
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1024
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1056
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1004
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1072
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1014
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1040
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xff4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1056
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1004
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1024
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfe4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1040
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1008
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfd4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1024
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfe4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:992
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfc4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1008
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:976
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfb4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:992
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:960
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfa4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:976
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:944
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf94
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:960
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfa4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:928
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf84
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:944
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:912
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf74
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:928
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:896
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf64
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:912
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:880
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf54
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:896
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:864
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf44
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:880
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:848
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf34
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:864
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:832
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf24
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:848
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:816
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf14
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:832
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:800
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf04
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:816
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:784
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xef4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:800
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:768
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xee4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:784
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xef4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:752
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xed4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:768
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xee4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:736
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xec4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:752
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xed4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:720
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xeb4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:736
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xec4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:704
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xea4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:720
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xeb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:688
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe94
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xea4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:672
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe84
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:688
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:656
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe74
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:672
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:640
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe64
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:656
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:624
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe54
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:640
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:608
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe44
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:624
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:592
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe34
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:608
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:576
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe24
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:592
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:560
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe14
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:576
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:544
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe04
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:560
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:528
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xdf4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:544
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:512
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xde4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:528
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:496
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xdd4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:512
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xde4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:480
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xdc4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:496
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:464
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xdb4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:480
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:448
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xda4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:464
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:432
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd94
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:448
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xda4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:416
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd84
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:432
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:400
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd74
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:416
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:384
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd64
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:400
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:368
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd54
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:384
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:352
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd44
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:368
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:336
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd34
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:352
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:320
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd24
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:336
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:304
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd14
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:320
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:288
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd04
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:304
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:272
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xcf4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:288
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:256
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xce4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:272
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:240
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xcd4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:256
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xce4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:224
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xcc4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:240
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:208
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xcb4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:224
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:192
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xca4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:208
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:176
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc94
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:192
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xca4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:160
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc84
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:176
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:144
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc74
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:160
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:128
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc64
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:144
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:112
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc54
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:128
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:96
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc44
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:112
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:80
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc34
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:96
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:64
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc24
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:80
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:48
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc14
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:64
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:32
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc04
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:48
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:16
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:32
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:16
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
-; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x480, v4
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
+; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x480, v4
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: v_add_co_u32 v2, vcc_lo, 0x780, v0
-; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[2:3], v[7:10], off offset:2032
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index b8cf692..64277e8 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -27,21 +27,21 @@ define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, v3, v1
; VI-NEXT: v_ashrrev_i32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v2i32:
@@ -94,24 +94,24 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v3, v7, v3
; VI-NEXT: v_ashrrev_i32_e32 v2, v6, v2
; VI-NEXT: v_ashrrev_i32_e32 v1, v5, v1
; VI-NEXT: v_ashrrev_i32_e32 v0, v4, v0
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v4i32:
@@ -175,31 +175,31 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s0, v1
-; VI-NEXT: v_readfirstlane_b32 s1, v0
-; VI-NEXT: s_ashr_i32 s2, s1, 16
-; VI-NEXT: s_sext_i32_i16 s1, s1
-; VI-NEXT: s_ashr_i32 s3, s0, 16
-; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_ashr_i32 s0, s1, s0
-; VI-NEXT: s_ashr_i32 s1, s2, s3
-; VI-NEXT: s_lshl_b32 s1, s1, 16
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_or_b32 s0, s0, s1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_readfirstlane_b32 s4, v1
+; VI-NEXT: v_readfirstlane_b32 s5, v0
+; VI-NEXT: s_ashr_i32 s6, s5, 16
+; VI-NEXT: s_sext_i32_i16 s5, s5
+; VI-NEXT: s_ashr_i32 s7, s4, 16
+; VI-NEXT: s_sext_i32_i16 s4, s4
+; VI-NEXT: s_ashr_i32 s4, s5, s4
+; VI-NEXT: s_ashr_i32 s5, s6, s7
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v2i16:
@@ -282,43 +282,43 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s0, v2
-; VI-NEXT: v_readfirstlane_b32 s1, v3
-; VI-NEXT: v_readfirstlane_b32 s2, v0
-; VI-NEXT: v_readfirstlane_b32 s3, v1
-; VI-NEXT: s_ashr_i32 s8, s3, 16
-; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: s_ashr_i32 s9, s2, 16
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_ashr_i32 s10, s1, 16
-; VI-NEXT: s_sext_i32_i16 s1, s1
-; VI-NEXT: s_ashr_i32 s11, s0, 16
-; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_ashr_i32 s0, s2, s0
-; VI-NEXT: s_ashr_i32 s2, s9, s11
-; VI-NEXT: s_ashr_i32 s1, s3, s1
-; VI-NEXT: s_ashr_i32 s3, s8, s10
-; VI-NEXT: s_lshl_b32 s3, s3, 16
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_or_b32 s1, s1, s3
-; VI-NEXT: s_or_b32 s0, s0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_readfirstlane_b32 s4, v2
+; VI-NEXT: v_readfirstlane_b32 s5, v3
+; VI-NEXT: v_readfirstlane_b32 s6, v0
+; VI-NEXT: v_readfirstlane_b32 s7, v1
+; VI-NEXT: s_ashr_i32 s8, s7, 16
+; VI-NEXT: s_sext_i32_i16 s7, s7
+; VI-NEXT: s_ashr_i32 s9, s6, 16
+; VI-NEXT: s_sext_i32_i16 s6, s6
+; VI-NEXT: s_ashr_i32 s10, s5, 16
+; VI-NEXT: s_sext_i32_i16 s5, s5
+; VI-NEXT: s_ashr_i32 s11, s4, 16
+; VI-NEXT: s_sext_i32_i16 s4, s4
+; VI-NEXT: s_ashr_i32 s4, s6, s4
+; VI-NEXT: s_ashr_i32 s6, s9, s11
+; VI-NEXT: s_ashr_i32 s5, s7, s5
+; VI-NEXT: s_ashr_i32 s7, s8, s10
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_or_b32 s4, s4, s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v4i16:
@@ -409,16 +409,16 @@ define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: s_ashr_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s5, s4, 31
-; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 8
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ashr_i32 s3, s2, 31
+; VI-NEXT: s_ashr_i64 s[0:1], s[2:3], 8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ashr_i64:
@@ -461,20 +461,20 @@ define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_i64_2:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i64 v[0:1], v2, v[0:1]
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_i64_2:
@@ -533,22 +533,22 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v2i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1]
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v2i64:
@@ -730,18 +730,18 @@ define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %
;
; VI-LABEL: s_ashr_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s6, s[0:1], 0x50
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s8, s[0:1], 0x50
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s6, 31
-; VI-NEXT: s_add_u32 s4, s6, s4
-; VI-NEXT: s_addc_u32 s5, s7, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ashr_i32 s1, s8, 31
+; VI-NEXT: s_add_u32 s0, s8, s2
+; VI-NEXT: s_addc_u32 s1, s1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ashr_32_i64:
@@ -785,17 +785,17 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_ashr_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
@@ -849,18 +849,18 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %
;
; VI-LABEL: s_ashr_63_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s6, s[0:1], 0x50
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s8, s[0:1], 0x50
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s6, s6, 31
-; VI-NEXT: s_add_u32 s4, s6, s4
-; VI-NEXT: s_addc_u32 s5, s6, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ashr_i32 s1, s8, 31
+; VI-NEXT: s_add_u32 s0, s1, s2
+; VI-NEXT: s_addc_u32 s1, s1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ashr_63_i64:
@@ -905,17 +905,17 @@ define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_ashr_63_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v3
diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index 418c160..d33723c 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -26,15 +26,15 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: lshr_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s4, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_lshr_b32 s0, s0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_i32:
@@ -83,17 +83,17 @@ define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: lshr_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s5, s7
-; VI-NEXT: s_lshr_b32 s4, s4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshr_b32 s1, s1, s3
+; VI-NEXT: s_lshr_b32 s0, s0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_v2i32:
@@ -212,16 +212,16 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: lshr_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_i64:
@@ -382,14 +382,14 @@ define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %
;
; VI-LABEL: s_lshr_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x50
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x50
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_lshr_32_i64:
@@ -428,17 +428,17 @@ define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_lshr_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 45aa544..6175d49 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -72,13 +72,13 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
;
; GFX8-LABEL: s_sub_imm_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_sub_i32 s0, 0x4d2, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -130,13 +130,13 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX8-LABEL: test_sub_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: flat_store_dword v[2:3], v0
@@ -144,24 +144,24 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: test_sub_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v1
-; GFX12-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -194,13 +194,13 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: test_sub_imm_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x7b, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -208,24 +208,24 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace
;
; GFX9-LABEL: test_sub_imm_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 0x7b, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_imm_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0x7b, v1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -257,13 +257,13 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX8-LABEL: test_sub_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
@@ -272,26 +272,26 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX9-LABEL: test_sub_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v3
; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v2
-; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -328,18 +328,18 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX8-LABEL: test_sub_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s2, 16
-; GFX8-NEXT: s_addc_u32 s3, s3, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: s_add_u32 s0, s6, 16
+; GFX8-NEXT: s_addc_u32 s1, s7, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
@@ -350,33 +350,33 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX9-LABEL: test_sub_v4i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3
; GFX9-NEXT: v_sub_u32_e32 v2, v6, v2
; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1
; GFX9-NEXT: v_sub_u32_e32 v0, v4, v0
-; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16
-; GFX12-NEXT: global_load_b128 v[4:7], v8, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v8, s[6:7] offset:16
+; GFX12-NEXT: global_load_b128 v[4:7], v8, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3
; GFX12-NEXT: v_sub_nc_u32_e32 v2, v6, v2
; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1
; GFX12-NEXT: v_sub_nc_u32_e32 v0, v4, v0
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -412,11 +412,11 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX8-LABEL: test_sub_i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -424,38 +424,38 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_ushort v2, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_sub_u16_e32 v2, v4, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: test_sub_i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_sub_u16_e32 v1, v1, v2
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:2 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u16 v0, v0, s[6:7] offset:2 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u16 v0, v1, v0
-; GFX12-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b16 v2, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -497,15 +497,15 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX8-LABEL: test_sub_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u16_e32 v4, v0, v1
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -515,26 +515,26 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX9-LABEL: test_sub_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_v2i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_sub_i16 v0, v0, v1
-; GFX12-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -583,15 +583,15 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX8-LABEL: test_sub_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u16_e32 v6, v1, v3
; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -604,28 +604,28 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX9-LABEL: test_sub_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_v4i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_sub_i16 v1, v1, v3
; GFX12-NEXT: v_pk_sub_i16 v0, v0, v2
-; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -657,14 +657,14 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
; GFX8-LABEL: s_sub_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_u32 s2, s4, s6
-; GFX8-NEXT: s_subb_u32 s3, s5, s7
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: s_sub_u32 s0, s4, s6
+; GFX8-NEXT: s_subb_u32 s1, s5, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
@@ -685,12 +685,12 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[4:5], s[6:7]
+; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[4:5], s[6:7]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -726,14 +726,14 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
; GFX8-LABEL: v_sub_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -764,13 +764,13 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
-; GFX12-NEXT: global_load_b64 v[2:3], v2, s[0:1]
+; GFX12-NEXT: global_load_b64 v[2:3], v2, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
@@ -817,14 +817,14 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
; GFX8-LABEL: v_test_sub_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -859,13 +859,13 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7]
-; GFX12-NEXT: global_load_b128 v[4:7], v4, s[0:1]
+; GFX12-NEXT: global_load_b128 v[4:7], v4, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6
; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
@@ -922,14 +922,14 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace
; GFX8-LABEL: v_test_sub_v4i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[12:13]
@@ -988,15 +988,15 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_load_b128 v[0:3], v12, s[6:7]
-; GFX12-NEXT: global_load_b128 v[4:7], v12, s[0:1]
+; GFX12-NEXT: global_load_b128 v[4:7], v12, s[2:3]
; GFX12-NEXT: global_load_b128 v[8:11], v12, s[6:7] offset:16
-; GFX12-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:16
+; GFX12-NEXT: global_load_b128 v[12:15], v12, s[2:3] offset:16
; GFX12-NEXT: s_wait_loadcnt 0x2
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6
; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 6c53afe..c1e7256 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -25,14 +25,14 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: v_test_sub_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -68,12 +68,12 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -114,12 +114,12 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: s_test_sub_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s6, s[6:7], 0x0
-; VI-NEXT: s_load_dword s7, s[0:1], 0x0
+; VI-NEXT: s_load_dword s7, s[8:9], 0x0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -153,14 +153,14 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_sub_i16 v0, s2, s0
+; GFX11-NEXT: v_pk_sub_i16 v0, s0, s1
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -175,32 +175,32 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
; GCN-LABEL: s_test_sub_self_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_sub_self_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_sub_self_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -277,62 +277,62 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b
+; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_add_u16_e32 v2, 0xff85, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_constant:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -349,62 +349,62 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3
+; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0x3df
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_neg_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -420,61 +420,61 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out,
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_add_u16_e32 v2, 1, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -490,60 +490,60 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 32
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 32
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -560,60 +560,60 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 1.0
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 1.0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 1.0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -649,14 +649,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v1, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -693,12 +693,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -747,14 +747,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -795,12 +795,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -848,14 +848,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -894,12 +894,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -948,14 +948,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
@@ -998,12 +998,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 7dce633..8486fba 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -12,11 +12,11 @@ declare void @llvm.debugtrap() #1
define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
; NOHSA-TRAP-GFX900-LABEL: trap:
; NOHSA-TRAP-GFX900: ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
;
@@ -103,16 +103,16 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; NOHSA-TRAP-GFX900-LABEL: non_entry_trap:
; NOHSA-TRAP-GFX900: ; %bb.0: ; %entry
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[2:3] glc
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
; NOHSA-TRAP-GFX900-NEXT: s_cbranch_vccz .LBB1_2
; NOHSA-TRAP-GFX900-NEXT: ; %bb.1: ; %ret
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 3
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
; NOHSA-TRAP-GFX900-NEXT: .LBB1_2: ; %trap
@@ -267,14 +267,14 @@ ret:
define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after:
; NOHSA-TRAP-GFX900: ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[4:5] glc
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2
; NOHSA-TRAP-GFX900-NEXT: ; %bb.1:
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[6:7]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: .LBB2_2:
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
@@ -403,14 +403,14 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
; NOHSA-TRAP-GFX900-LABEL: debugtrap:
; NOHSA-TRAP-GFX900: ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v2, s[0:1]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v2, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index c0c56eb..b6056f6 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -104,15 +104,15 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) noc
; VI-LABEL: truncate_high_elt_extract_vector:
; VI: ; %bb.0: ; %bb
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[4:5], 0x0
-; VI-NEXT: s_load_dword s3, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_load_dword s0, s[4:5], 0x0
+; VI-NEXT: s_load_dword s1, s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sext_i32_i16 s0, s2
-; VI-NEXT: s_sext_i32_i16 s1, s3
+; VI-NEXT: s_sext_i32_i16 s0, s0
+; VI-NEXT: s_sext_i32_i16 s1, s1
; VI-NEXT: s_mul_i32 s1, s1, s0
; VI-NEXT: s_lshr_b32 s0, s1, 16
; VI-NEXT: v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
index e668c1d..d9e0e02 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
@@ -11,11 +11,11 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed $sgpr1
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:sreg_64 = COPY killed [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:sreg_64 = COPY killed [[COPY1]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[COPY2]], 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %11.sub0
; CHECK-NEXT: $sgpr0 = COPY killed [[COPY3]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %11.sub1
; CHECK-NEXT: $sgpr1 = COPY killed [[COPY4]]
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
; CHECK-NEXT: $sgpr2 = COPY killed [[COPY5]]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index 416dbb2..eb45776 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -28,12 +28,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-LABEL: s_uaddo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s6, s0
+; VI-NEXT: s_add_u32 s0, s6, s2
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s7, s1
+; VI-NEXT: s_addc_u32 s1, s7, s3
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -96,12 +96,12 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-LABEL: s_uaddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -161,18 +161,18 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_uaddo_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -182,16 +182,16 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_uaddo_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -243,18 +243,18 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: v_uaddo_i32_novcc:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -268,19 +268,19 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
;
; GFX9-LABEL: v_uaddo_i32_novcc:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -325,19 +325,19 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: s_uaddo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_addc_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -345,19 +345,19 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: s_uaddo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s6, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_addc_u32 s7, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_add_u32 s0, s8, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s9, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
@@ -401,18 +401,18 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_uaddo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
@@ -424,18 +424,18 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_uaddo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -486,18 +486,18 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_uaddo_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_ushort v4, v[0:1]
+; VI-NEXT: flat_load_ushort v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_ushort v4, v[0:1]
-; VI-NEXT: flat_load_ushort v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v5, v4, v5
; VI-NEXT: v_cmp_lt_u16_e32 vcc, v5, v4
@@ -508,17 +508,17 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_uaddo_i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
-; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v1, v0, s[8:9]
+; GFX9-NEXT: global_load_ushort v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u16_e32 v2, v1, v2
; GFX9-NEXT: v_cmp_lt_u16_e32 vcc, v2, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: global_store_short v0, v2, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v2, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -568,18 +568,18 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_uaddo_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
@@ -591,18 +591,18 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_uaddo_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX9-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index c7952f5..8e75127 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -44,17 +44,17 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: udiv_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v2, v1
; VI-NEXT: v_sub_u32_e32 v3, vcc, 0, v1
@@ -75,7 +75,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v2
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32:
@@ -401,17 +401,17 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: udiv_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s2, s6
+; VI-NEXT: s_mov_b32 s3, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s10
+; VI-NEXT: s_mov_b32 s1, s11
+; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s4, s8
+; VI-NEXT: s_mov_b32 s5, s9
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v4, v2
; VI-NEXT: v_cvt_f32_u32_e32 v5, v3
@@ -714,18 +714,18 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: udiv_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s2, s10
+; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
-; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_u32_e32 v8, v0
; VI-NEXT: v_cvt_f32_u32_e32 v10, v1
@@ -1116,20 +1116,20 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: udiv_i32_div_pow2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 4, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32_div_pow2:
@@ -1203,22 +1203,22 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
;
; VI-LABEL: udiv_i32_div_k_even:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s2, 0xfabbd9c1
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, 0xfabbd9c1
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mul_hi_u32 v0, v0, s2
+; VI-NEXT: v_mul_hi_u32 v0, v0, s0
+; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: v_lshrrev_b32_e32 v0, 25, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_even:
@@ -1297,22 +1297,22 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
;
; VI-LABEL: udiv_i32_div_k_odd:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s2, 0x7d5deca3
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, 0x7d5deca3
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mul_hi_u32 v0, v0, s2
+; VI-NEXT: v_mul_hi_u32 v0, v0, s0
+; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_odd:
@@ -1400,18 +1400,18 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: v_udiv_i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0
@@ -1424,7 +1424,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i8:
@@ -1540,18 +1540,18 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: v_udiv_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1564,7 +1564,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i16:
@@ -1688,20 +1688,20 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: v_udiv_i23:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -1720,7 +1720,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i23:
@@ -1885,20 +1885,20 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: v_udiv_i24:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -1917,7 +1917,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i24:
@@ -2076,30 +2076,30 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
;
; VI-LABEL: scalarize_mulhu_4xi32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s0, 0x1389c755
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s4, 0x1389c755
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
; VI-NEXT: v_lshrrev_b32_e32 v1, 2, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
-; VI-NEXT: v_mul_hi_u32 v0, v0, s0
-; VI-NEXT: v_mul_hi_u32 v1, v1, s0
-; VI-NEXT: v_mul_hi_u32 v2, v2, s0
-; VI-NEXT: v_mul_hi_u32 v3, v3, s0
+; VI-NEXT: v_mul_hi_u32 v0, v0, s4
+; VI-NEXT: v_mul_hi_u32 v1, v1, s4
+; VI-NEXT: v_mul_hi_u32 v2, v2, s4
+; VI-NEXT: v_mul_hi_u32 v3, v3, s4
; VI-NEXT: v_lshrrev_b32_e32 v0, 10, v0
; VI-NEXT: v_lshrrev_b32_e32 v1, 10, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 10, v2
; VI-NEXT: v_lshrrev_b32_e32 v3, 10, v3
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: scalarize_mulhu_4xi32:
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index f0f0b66..0bb2127 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -77,37 +77,37 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a
;
; GFX8-LABEL: test_udivrem:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x98
-; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74
+; GFX8-NEXT: s_load_dword s6, s[0:1], 0x98
+; GFX8-NEXT: s_load_dword s7, s[0:1], 0x74
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX8-NEXT: s_sub_i32 s2, 0, s4
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX8-NEXT: s_sub_i32 s2, 0, s6
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT: v_mul_hi_u32 v4, s5, v0
+; GFX8-NEXT: v_mul_hi_u32 v4, s7, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_readfirstlane_b32 s0, v4
-; GFX8-NEXT: s_mul_i32 s0, s0, s4
-; GFX8-NEXT: s_sub_i32 s0, s5, s0
-; GFX8-NEXT: s_sub_i32 s1, s0, s4
+; GFX8-NEXT: s_mul_i32 s0, s0, s6
+; GFX8-NEXT: s_sub_i32 s0, s7, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s6
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v4
-; GFX8-NEXT: s_cmp_ge_u32 s0, s4
+; GFX8-NEXT: s_cmp_ge_u32 s0, s6
; GFX8-NEXT: s_cselect_b64 vcc, -1, 0
; GFX8-NEXT: s_cselect_b32 s0, s1, s0
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX8-NEXT: s_sub_i32 s1, s0, s4
+; GFX8-NEXT: s_sub_i32 s1, s0, s6
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v4
-; GFX8-NEXT: s_cmp_ge_u32 s0, s4
+; GFX8-NEXT: s_cmp_ge_u32 s0, s6
; GFX8-NEXT: s_cselect_b64 vcc, -1, 0
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; GFX8-NEXT: s_cselect_b32 s0, s1, s0
@@ -212,7 +212,6 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-LABEL: test_udivrem_v2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX8-NEXT: s_sub_i32 s2, 0, s6
@@ -227,7 +226,6 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: s_mul_i32 s2, s2, s6
; GFX8-NEXT: s_sub_i32 s2, s4, s2
@@ -236,24 +234,27 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s6
; GFX8-NEXT: s_cmp_ge_u32 s2, s6
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_sub_i32 s3, 0, s7
-; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_cselect_b32 s4, s3, s2
+; GFX8-NEXT: s_sub_i32 s2, 0, s7
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: s_mul_i32 s2, s2, s7
-; GFX8-NEXT: s_sub_i32 s2, s5, s2
-; GFX8-NEXT: s_sub_i32 s3, s2, s7
-; GFX8-NEXT: s_cmp_ge_u32 s2, s7
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_sub_i32 s3, s2, s7
-; GFX8-NEXT: s_cmp_ge_u32 s2, s7
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mul_i32 s0, s0, s7
+; GFX8-NEXT: s_sub_i32 s0, s5, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s7
+; GFX8-NEXT: s_cmp_ge_u32 s0, s7
+; GFX8-NEXT: s_cselect_b32 s0, s1, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s7
+; GFX8-NEXT: s_cmp_ge_u32 s0, s7
+; GFX8-NEXT: s_cselect_b32 s0, s1, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
%result0 = udiv <2 x i32> %x, %y
@@ -419,14 +420,11 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX8-LABEL: test_udivrem_v4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX8-NEXT: s_sub_i32 s2, 0, s8
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s9
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -445,9 +443,9 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s8
; GFX8-NEXT: s_cmp_ge_u32 s2, s8
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_sub_i32 s3, 0, s9
-; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1
+; GFX8-NEXT: s_cselect_b32 s4, s3, s2
+; GFX8-NEXT: s_sub_i32 s2, 0, s9
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
@@ -455,50 +453,54 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11
-; GFX8-NEXT: v_readfirstlane_b32 s3, v0
-; GFX8-NEXT: s_mul_i32 s3, s3, s9
-; GFX8-NEXT: s_sub_i32 s3, s5, s3
-; GFX8-NEXT: s_sub_i32 s4, s3, s9
-; GFX8-NEXT: s_cmp_ge_u32 s3, s9
-; GFX8-NEXT: s_cselect_b32 s3, s4, s3
-; GFX8-NEXT: s_sub_i32 s4, s3, s9
-; GFX8-NEXT: s_cmp_ge_u32 s3, s9
-; GFX8-NEXT: s_cselect_b32 s3, s4, s3
-; GFX8-NEXT: s_sub_i32 s4, 0, s10
-; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: s_mul_i32 s2, s2, s9
+; GFX8-NEXT: s_sub_i32 s2, s5, s2
+; GFX8-NEXT: s_sub_i32 s3, s2, s9
+; GFX8-NEXT: s_cmp_ge_u32 s2, s9
+; GFX8-NEXT: s_cselect_b32 s2, s3, s2
+; GFX8-NEXT: s_sub_i32 s3, s2, s9
+; GFX8-NEXT: s_cmp_ge_u32 s2, s9
+; GFX8-NEXT: s_cselect_b32 s5, s3, s2
+; GFX8-NEXT: s_sub_i32 s2, 0, s10
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mul_i32 s4, s4, s10
-; GFX8-NEXT: s_sub_i32 s4, s6, s4
-; GFX8-NEXT: s_sub_i32 s5, s4, s10
-; GFX8-NEXT: s_cmp_ge_u32 s4, s10
-; GFX8-NEXT: s_cselect_b32 s4, s5, s4
-; GFX8-NEXT: s_sub_i32 s5, s4, s10
-; GFX8-NEXT: s_cmp_ge_u32 s4, s10
-; GFX8-NEXT: s_cselect_b32 s4, s5, s4
-; GFX8-NEXT: s_sub_i32 s5, 0, s11
-; GFX8-NEXT: v_mul_lo_u32 v0, s5, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: s_mul_i32 s2, s2, s10
+; GFX8-NEXT: s_sub_i32 s2, s6, s2
+; GFX8-NEXT: s_sub_i32 s3, s2, s10
+; GFX8-NEXT: s_cmp_ge_u32 s2, s10
+; GFX8-NEXT: s_cselect_b32 s2, s3, s2
+; GFX8-NEXT: s_sub_i32 s3, s2, s10
+; GFX8-NEXT: s_cmp_ge_u32 s2, s10
+; GFX8-NEXT: s_cselect_b32 s6, s3, s2
+; GFX8-NEXT: s_sub_i32 s2, 0, s11
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v3, s7, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_readfirstlane_b32 s2, v3
-; GFX8-NEXT: s_mul_i32 s2, s2, s11
-; GFX8-NEXT: s_sub_i32 s2, s7, s2
-; GFX8-NEXT: s_sub_i32 s3, s2, s11
-; GFX8-NEXT: s_cmp_ge_u32 s2, s11
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_sub_i32 s3, s2, s11
-; GFX8-NEXT: s_cmp_ge_u32 s2, s11
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_readfirstlane_b32 s0, v3
+; GFX8-NEXT: s_mul_i32 s0, s0, s11
+; GFX8-NEXT: s_sub_i32 s0, s7, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s11
+; GFX8-NEXT: s_cmp_ge_u32 s0, s11
+; GFX8-NEXT: s_cselect_b32 s0, s1, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s11
+; GFX8-NEXT: s_cmp_ge_u32 s0, s11
+; GFX8-NEXT: s_cselect_b32 s0, s1, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
%result0 = udiv <4 x i32> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 79b0a96..2a4066d 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -28,42 +28,42 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
;
; GFX8-LABEL: s_uint_to_fp_i64_to_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s4, s3
-; GFX8-NEXT: s_min_u32 s4, s4, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX8-NEXT: s_flbit_i32_b32 s0, s7
+; GFX8-NEXT: s_min_u32 s2, s0, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_ldexp_f32 v0, v0, s0
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: s_uint_to_fp_i64_to_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s4, s3
+; GFX11-NEXT: s_clz_i32_u32 s0, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s4, s4, 32
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_min_u32 s2, s0, 32
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_min_u32 s0, s0, 1
+; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT: s_sub_i32 s0, 32, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -100,12 +100,12 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_uint_to_fp_i64_to_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -116,8 +116,8 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_min_u32_e32 v1, 1, v1
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_ldexp_f32 v1, v1, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
@@ -126,11 +126,11 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: v_uint_to_fp_i64_to_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -145,7 +145,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -180,39 +180,39 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i
;
; GFX8-LABEL: s_uint_to_fp_i64_to_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s4, s3
-; GFX8-NEXT: s_min_u32 s4, s4, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_sub_i32 s0, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_flbit_i32_b32 s0, s7
+; GFX8-NEXT: s_min_u32 s2, s0, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_ldexp_f32 v2, v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: s_uint_to_fp_i64_to_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s4, s3
+; GFX11-NEXT: s_clz_i32_u32 s0, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s4, s4, 32
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_min_u32 s2, s0, 32
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_min_u32 s0, s0, 1
+; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT: s_sub_i32 s0, 32, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -248,23 +248,23 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_uint_to_fp_i64_to_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v2
; GFX8-NEXT: v_min_u32_e32 v4, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v5, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v3
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; GFX8-NEXT: v_ldexp_f32 v2, v5, v2
@@ -273,11 +273,11 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: v_uint_to_fp_i64_to_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -290,7 +290,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -333,26 +333,26 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s2, s7
-; GFX8-NEXT: s_flbit_i32_b32 s3, s5
-; GFX8-NEXT: s_min_u32 s8, s2, 32
-; GFX8-NEXT: s_min_u32 s9, s3, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
-; GFX8-NEXT: s_min_u32 s2, s2, 1
+; GFX8-NEXT: s_flbit_i32_b32 s0, s7
+; GFX8-NEXT: s_flbit_i32_b32 s1, s5
+; GFX8-NEXT: s_min_u32 s8, s0, 32
+; GFX8-NEXT: s_min_u32 s9, s1, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_min_u32 s0, s0, 1
; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT: s_min_u32 s2, s4, 1
-; GFX8-NEXT: s_or_b32 s2, s5, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s8
-; GFX8-NEXT: v_ldexp_f32 v1, v0, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s9
-; GFX8-NEXT: v_ldexp_f32 v0, v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX8-NEXT: s_min_u32 s0, s4, 1
+; GFX8-NEXT: s_or_b32 s0, s5, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s8
+; GFX8-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s9
+; GFX8-NEXT: v_ldexp_f32 v0, v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -360,27 +360,27 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s2, s7
-; GFX11-NEXT: s_clz_i32_u32 s3, s5
-; GFX11-NEXT: s_min_u32 s8, s2, 32
-; GFX11-NEXT: s_min_u32 s9, s3, 32
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
+; GFX11-NEXT: s_clz_i32_u32 s0, s7
+; GFX11-NEXT: s_clz_i32_u32 s1, s5
+; GFX11-NEXT: s_min_u32 s8, s0, 32
+; GFX11-NEXT: s_min_u32 s9, s1, 32
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
-; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s0, s0, 1
; GFX11-NEXT: s_min_u32 s4, s4, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s3, s5, s4
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s8
-; GFX11-NEXT: s_sub_i32 s3, 32, s9
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s5, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s1
+; GFX11-NEXT: s_sub_i32 s0, 32, s8
+; GFX11-NEXT: s_sub_i32 s1, 32, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ldexp_f32 v1, v0, s2
-; GFX11-NEXT: v_ldexp_f32 v0, v2, s3
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX11-NEXT: v_ldexp_f32 v0, v2, s1
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -443,19 +443,19 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v4
@@ -496,12 +496,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
;
; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4
; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2
@@ -540,7 +540,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
; GFX11-NEXT: v_ldexp_f32 v2, v1, v10
; GFX11-NEXT: v_ldexp_f32 v1, v6, v11
; GFX11-NEXT: v_ldexp_f32 v0, v4, v5
-; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v7, v[0:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -587,29 +587,29 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s2, s7
-; GFX8-NEXT: s_flbit_i32_b32 s3, s5
-; GFX8-NEXT: s_min_u32 s8, s2, 32
-; GFX8-NEXT: s_min_u32 s9, s3, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s9
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX8-NEXT: s_flbit_i32_b32 s0, s7
+; GFX8-NEXT: s_flbit_i32_b32 s1, s5
+; GFX8-NEXT: s_min_u32 s8, s0, 32
+; GFX8-NEXT: s_min_u32 s9, s1, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], s9
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s0
; GFX8-NEXT: s_sub_i32 s6, 32, s8
-; GFX8-NEXT: s_sub_i32 s2, 32, s9
+; GFX8-NEXT: s_sub_i32 s0, 32, s9
; GFX8-NEXT: v_ldexp_f32 v0, v0, s6
-; GFX8-NEXT: v_ldexp_f32 v1, v1, s2
+; GFX8-NEXT: v_ldexp_f32 v1, v1, s0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_or_b32_e32 v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -617,32 +617,32 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s2, s7
-; GFX11-NEXT: s_clz_i32_u32 s3, s5
-; GFX11-NEXT: s_min_u32 s8, s2, 32
-; GFX11-NEXT: s_min_u32 s9, s3, 32
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
+; GFX11-NEXT: s_clz_i32_u32 s0, s7
+; GFX11-NEXT: s_clz_i32_u32 s1, s5
+; GFX11-NEXT: s_min_u32 s8, s0, 32
+; GFX11-NEXT: s_min_u32 s9, s1, 32
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
-; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s0, s0, 1
; GFX11-NEXT: s_min_u32 s4, s4, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s3, s5, s4
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s8
-; GFX11-NEXT: s_sub_i32 s3, 32, s9
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s5, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s1
+; GFX11-NEXT: s_sub_i32 s0, 32, s8
+; GFX11-NEXT: s_sub_i32 s1, 32, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT: v_ldexp_f32 v1, v1, s3
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
+; GFX11-NEXT: v_ldexp_f32 v1, v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -713,18 +713,18 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
-; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v4
; GFX8-NEXT: v_ffbh_u32_e32 v11, v2
@@ -763,7 +763,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v9
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc
; GFX8-NEXT: v_or_b32_e32 v2, v4, v3
; GFX8-NEXT: v_or_b32_e32 v3, v6, v5
@@ -772,12 +772,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
;
; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4
; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2
@@ -825,7 +825,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index 5f8d0f6..f4debc2 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @uitofp_i16_to_f16(
;
; VI-LABEL: uitofp_i16_to_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: uitofp_i16_to_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @uitofp_i32_to_f16(
;
; VI-LABEL: uitofp_i32_to_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: uitofp_i32_to_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -168,44 +168,44 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
;
; VI-LABEL: uitofp_v2i16_to_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_u16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: uitofp_v2i16_to_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -244,39 +244,39 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
;
; VI-LABEL: uitofp_v2i32_to_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v1, v1
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: uitofp_v2i32_to_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
@@ -285,7 +285,7 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -357,20 +357,19 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -378,9 +377,10 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index f60a274..5fc395b 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -244,12 +244,12 @@ define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0
; VI-NEXT: s_cbranch_vccnz .LBB4_2
; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: .LBB4_2: ; %endif
; VI-NEXT: s_endpgm
entry:
@@ -296,12 +296,12 @@ define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, f
; VI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0
; VI-NEXT: s_cbranch_vccnz .LBB5_2
; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: .LBB5_2: ; %endif
; VI-NEXT: s_endpgm
entry:
@@ -342,20 +342,19 @@ define amdgpu_kernel void @uniform_if_else_ret(ptr addrspace(1) nocapture %out,
; VI-LABEL: uniform_if_else_ret:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 .LBB6_2
; VI-NEXT: ; %bb.1: ; %if.else
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, 2
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB6_2: ; %if.then
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
%cmp = icmp eq i32 %a, 0
@@ -403,28 +402,29 @@ define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr
;
; VI-LABEL: uniform_if_else:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s4, 0
+; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 .LBB7_2
; VI-NEXT: ; %bb.1: ; %if.else
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v0, 2
; VI-NEXT: s_branch .LBB7_3
; VI-NEXT: .LBB7_2: ; %if.then
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: .LBB7_3: ; %if.end
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: v_mov_b32_e32 v0, 3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
entry:
%cmp = icmp eq i32 %a, 0
@@ -530,13 +530,13 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, p
; VI-NEXT: .LBB9_2: ; %bb9
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB9_3: ; %bb7
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
@@ -626,20 +626,20 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %
; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc
; VI-NEXT: s_cbranch_execz .LBB11_2
; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_cbranch_scc0 .LBB11_3
; VI-NEXT: .LBB11_2: ; %endif
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB11_3: ; %if_uniform
; VI-NEXT: v_mov_b32_e32 v0, 1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
@@ -692,18 +692,18 @@ define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 %
; VI-NEXT: .LBB12_1: ; %endif
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB12_2: ; %if
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc
; VI-NEXT: s_cbranch_execz .LBB12_1
; VI-NEXT: ; %bb.3: ; %if_uniform
; VI-NEXT: v_mov_b32_e32 v0, 1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
%u_cmp = icmp eq i32 %cond, 0
@@ -832,16 +832,16 @@ define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr
; VI-NEXT: s_cmp_lt_i32 s2, 1
; VI-NEXT: s_cbranch_scc1 .LBB14_2
; VI-NEXT: ; %bb.1: ; %bb2
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: .LBB14_2: ; %bb9
; VI-NEXT: s_endpgm
bb:
@@ -886,20 +886,20 @@ define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %ou
;
; VI-LABEL: uniform_if_scc_i64_eq:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_eq_u64 s[0:1], 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_eq_u64 s[4:5], 0
; VI-NEXT: s_cbranch_scc1 .LBB15_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s0, 1
; VI-NEXT: .LBB15_2: ; %done
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i64 %cond, 0
@@ -940,20 +940,20 @@ define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %ou
;
; VI-LABEL: uniform_if_scc_i64_ne:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
; VI-NEXT: s_cbranch_scc1 .LBB16_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s0, 1
; VI-NEXT: .LBB16_2: ; %done
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
entry:
%cmp0 = icmp ne i64 %cond, 0
@@ -994,21 +994,21 @@ define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %o
;
; VI-LABEL: uniform_if_scc_i64_sgt:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_and_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[4:5], 0
+; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccnz .LBB17_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s0, 1
; VI-NEXT: .LBB17_2: ; %done
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
entry:
%cmp0 = icmp sgt i64 %cond, 0
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 666ae7c1..092d74f 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -29,12 +29,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-LABEL: s_usubo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_sub_u32 s0, s6, s0
+; VI-NEXT: s_sub_u32 s0, s6, s2
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_subb_u32 s1, s7, s1
+; VI-NEXT: s_subb_u32 s1, s7, s3
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -96,12 +96,12 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-LABEL: s_usubo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_sub_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_sub_u32_e32 v4, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -161,18 +161,18 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_usubo_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -182,16 +182,16 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_usubo_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -243,18 +243,18 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: v_usubo_i32_novcc:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -268,19 +268,19 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
;
; GFX9-LABEL: v_usubo_i32_novcc:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -325,19 +325,19 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: s_usubo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_sub_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_subb_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -345,19 +345,19 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: s_usubo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s6, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_subb_u32 s7, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_sub_u32 s0, s8, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_subb_u32 s1, s9, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
@@ -401,18 +401,18 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_usubo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
@@ -424,18 +424,18 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_usubo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -486,18 +486,18 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_usubo_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_ushort v4, v[0:1]
+; VI-NEXT: flat_load_ushort v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_ushort v4, v[0:1]
-; VI-NEXT: flat_load_ushort v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_e32 v5, v4, v5
; VI-NEXT: v_cmp_gt_u16_e32 vcc, v5, v4
@@ -508,17 +508,17 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_usubo_i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
-; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v1, v0, s[8:9]
+; GFX9-NEXT: global_load_ushort v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u16_e32 v2, v1, v2
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, v2, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: global_store_short v0, v2, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v2, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -568,18 +568,18 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_usubo_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v1, vcc, v1, v3
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
@@ -591,18 +591,18 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_usubo_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX9-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
index ca4d689..f20a92d 100644
--- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
@@ -25,16 +25,16 @@ bb:
define amdgpu_kernel void @test_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
; GFX9-LABEL: test_add_co_sdwa:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v2, s[2:3]
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX9-NEXT: global_load_dword v4, v2, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index e5de7d0..27dcdf9 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -40,15 +40,15 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_eq_u32 s2, 0
+; VI-NEXT: s_cmp_eq_u32 s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -79,14 +79,14 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_eq_u32 s2, 0
+; GFX11-NEXT: s_cmp_eq_u32 s4, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -185,14 +185,14 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -213,13 +213,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[4:5]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -279,13 +279,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -317,14 +317,14 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o
; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -345,13 +345,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -411,13 +411,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -494,13 +494,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -579,13 +579,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -703,14 +703,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -744,12 +744,12 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
@@ -795,14 +795,14 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -836,12 +836,12 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc
@@ -888,14 +888,14 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -931,12 +931,12 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
@@ -988,15 +988,15 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1)
; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1037,13 +1037,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
@@ -1097,15 +1097,15 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1)
; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1146,13 +1146,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
@@ -1208,15 +1208,15 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1)
; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1257,13 +1257,13 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
@@ -1316,14 +1316,14 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: flat_load_dword v2, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1363,12 +1363,12 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
@@ -1424,15 +1424,15 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1)
; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1470,13 +1470,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
@@ -1527,15 +1527,15 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1)
; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1572,13 +1572,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1626,14 +1626,14 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1)
; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1667,12 +1667,12 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc
@@ -1723,14 +1723,14 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1772,12 +1772,12 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc
@@ -1839,17 +1839,17 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1882,17 +1882,17 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1937,15 +1937,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c,
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s2, 0
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_cmp_lg_u32 s4, 0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1976,14 +1976,14 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c,
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2032,18 +2032,18 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -2077,10 +2077,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
@@ -2088,7 +2088,7 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index f7933d7..4b76d5c 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -56,25 +56,25 @@ define amdgpu_kernel void @madak_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -170,15 +170,15 @@ define amdgpu_kernel void @madak_f16_use_2(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
-; GFX11-NEXT: s_mov_b32 s14, -1
-; GFX11-NEXT: s_mov_b32 s15, 0x31016000
-; GFX11-NEXT: s_mov_b32 s18, s14
-; GFX11-NEXT: s_mov_b32 s19, s15
-; GFX11-NEXT: s_mov_b32 s22, s14
-; GFX11-NEXT: s_mov_b32 s23, s15
-; GFX11-NEXT: s_mov_b32 s2, s14
-; GFX11-NEXT: s_mov_b32 s3, s15
+; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s16, s8
; GFX11-NEXT: s_mov_b32 s17, s9
@@ -188,19 +188,21 @@ define amdgpu_kernel void @madak_f16_use_2(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v2, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v2, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s4
-; GFX11-NEXT: s_mov_b32 s13, s5
-; GFX11-NEXT: s_mov_b32 s0, s6
-; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
; GFX11-NEXT: v_mul_f16_e32 v1, v0, v1
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f16_e32 v1, 0x4900, v1
; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0
-; GFX11-NEXT: buffer_store_b16 v1, off, s[12:15], 0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 8bc8fbd..c2abd4f 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -7,12 +7,12 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GCN-LABEL: v_pack_b32_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -24,12 +24,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
;
; GISEL-LABEL: v_pack_b32_v2f16:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -56,12 +56,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GCN-LABEL: v_pack_b32_v2f16_sub:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1
; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -73,12 +73,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
;
; GISEL-LABEL: v_pack_b32_v2f16_sub:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_subrev_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -105,36 +105,36 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
define amdgpu_kernel void @fptrunc(
; GCN-LABEL: fptrunc:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s7, 0x31016000
-; GCN-NEXT: s_mov_b32 s10, s6
-; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x31016000
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s2
-; GCN-NEXT: s_mov_b32 s9, s3
-; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
-; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; GISEL-LABEL: fptrunc:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GISEL-NEXT: s_mov_b32 s6, -1
+; GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
-; GISEL-NEXT: s_mov_b32 s2, -1
-; GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1
; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GISEL-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
@@ -147,12 +147,12 @@ define amdgpu_kernel void @fptrunc(
define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GCN-LABEL: v_pack_b32.fabs:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -164,12 +164,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
;
; GISEL-LABEL: v_pack_b32.fabs:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -198,12 +198,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GCN-LABEL: v_pack_b32.fneg:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -215,12 +215,12 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
;
; GISEL-LABEL: v_pack_b32.fneg:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 7f69c47..6c8f288 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -413,18 +413,18 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) {
define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> inreg %src) {
; SDAG-VI-LABEL: vec_smax_smin_sgpr:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16
-; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0
-; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0
+; SDAG-VI-NEXT: s_lshr_b32 s0, s4, 16
+; SDAG-VI-NEXT: v_max_i16_e64 v1, s4, 0
+; SDAG-VI-NEXT: v_max_i16_e64 v2, s0, 0
; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1
; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dword v[0:1], v2
; SDAG-VI-NEXT: s_endpgm
;
@@ -443,41 +443,41 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
; SDAG-GFX11-LABEL: vec_smax_smin_sgpr:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s2, 0
+; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s4, 0
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; GISEL-VI-LABEL: vec_smax_smin_sgpr:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GISEL-VI-NEXT: s_sext_i32_i16 s3, 0
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_sext_i32_i16 s0, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: s_lshr_b32 s4, s2, 16
-; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
+; GISEL-VI-NEXT: s_lshr_b32 s1, s4, 16
; GISEL-VI-NEXT: s_sext_i32_i16 s4, s4
-; GISEL-VI-NEXT: s_max_i32 s2, s2, s3
-; GISEL-VI-NEXT: s_max_i32 s3, s4, s3
+; GISEL-VI-NEXT: s_sext_i32_i16 s1, s1
+; GISEL-VI-NEXT: s_max_i32 s4, s4, s0
+; GISEL-VI-NEXT: s_max_i32 s0, s1, s0
+; GISEL-VI-NEXT: s_sext_i32_i16 s1, s4
; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0xff
-; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
-; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT: s_min_i32 s3, s3, s4
-; GISEL-VI-NEXT: s_min_i32 s2, s2, s4
-; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3
-; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2
-; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16
-; GISEL-VI-NEXT: s_or_b32 s2, s2, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: s_sext_i32_i16 s0, s0
+; GISEL-VI-NEXT: s_min_i32 s0, s0, s4
+; GISEL-VI-NEXT: s_min_i32 s1, s1, s4
+; GISEL-VI-NEXT: s_and_b32 s0, 0xffff, s0
+; GISEL-VI-NEXT: s_and_b32 s1, 0xffff, s1
+; GISEL-VI-NEXT: s_lshl_b32 s0, s0, 16
+; GISEL-VI-NEXT: s_or_b32 s0, s1, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
; GISEL-VI-NEXT: flat_store_dword v[0:1], v2
; GISEL-VI-NEXT: s_endpgm
;
@@ -506,26 +506,26 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
; GISEL-GFX11-LABEL: vec_smax_smin_sgpr:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_sext_i32_i16 s0, 0
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2
-; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16
-; GISEL-GFX11-NEXT: s_max_i32 s3, s4, s3
-; GISEL-GFX11-NEXT: s_max_i32 s2, s2, 0
+; GISEL-GFX11-NEXT: s_sext_i32_i16 s1, s4
+; GISEL-GFX11-NEXT: s_ashr_i32 s4, s4, 16
+; GISEL-GFX11-NEXT: s_max_i32 s0, s1, s0
+; GISEL-GFX11-NEXT: s_max_i32 s1, s4, 0
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0xff00ff
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2
-; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16
-; GISEL-GFX11-NEXT: s_min_i32 s3, s4, s3
-; GISEL-GFX11-NEXT: s_min_i32 s2, s2, 0xff
+; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GISEL-GFX11-NEXT: s_sext_i32_i16 s1, 0xff00ff
+; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s0
+; GISEL-GFX11-NEXT: s_ashr_i32 s0, s0, 16
+; GISEL-GFX11-NEXT: s_min_i32 s1, s4, s1
+; GISEL-GFX11-NEXT: s_min_i32 s0, s0, 0xff
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
index d5347f8..b60ae19 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
@@ -25,16 +25,16 @@ bb:
define amdgpu_kernel void @test_sub_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
; GFX9-LABEL: test_sub_co_sdwa:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v2, s[2:3]
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX9-NEXT: global_load_dword v4, v2, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 340f0cd..836b1d4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -473,9 +473,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
; SI-NEXT: bb.1.if.then:
; SI-NEXT: successors: %bb.7(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
- ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %48, 0, implicit $exec
- ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; SI-NEXT: early-clobber %27:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
+ ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %27.sub0, killed %48, 0, implicit $exec
+ ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed %27.sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1)
; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
@@ -570,9 +570,9 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe
; SI-NEXT: bb.1.if.then:
; SI-NEXT: successors: %bb.2(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4)
+ ; SI-NEXT: early-clobber %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4)
; SI-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec
- ; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
+ ; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed %4, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
; SI-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 16, 0, implicit $exec :: (invariant load (s128) from %ir.3 + 16, addrspace 4)
; SI-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3
; SI-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub2
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index f78b408..ea48047 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -286,8 +286,8 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-LABEL: v32i8_liveout:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX906-NEXT: v_lshlrev_b32_e32 v31, 5, v0
-; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX906-NEXT: v_mov_b32_e32 v9, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
@@ -319,7 +319,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5
; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5
; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB5_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[6:7] offset:16
@@ -351,7 +351,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5
; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5
; GFX906-NEXT: .LBB5_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30
; GFX906-NEXT: v_lshlrev_b16_e32 v31, 8, v33
; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
@@ -372,7 +372,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1]
+; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[2:3]
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v20
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -393,7 +393,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[0:1] offset:16
+; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[2:3] offset:16
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index e12a4be..a033d5d 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -8,26 +8,26 @@
define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_i32:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo
-; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_vopc_i32:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc
-; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
@@ -41,26 +41,26 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) {
define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_f32:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_nge_f32_e32 vcc_lo, 0, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, vcc_lo
-; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_vopc_f32:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_nge_f32_e32 vcc, 0, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, vcc
-; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
@@ -101,28 +101,28 @@ define amdgpu_ps void @test_vopc_vcmp(float %x) {
define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_2xf16:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_le_f16_sdwa vcc_lo, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
; GFX1032-NEXT: v_cndmask_b32_e32 v1, 0x3c003c00, v1, vcc_lo
-; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_vopc_2xf16:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
; GFX1064-NEXT: v_cndmask_b32_e32 v1, 0x3c003c00, v1, vcc
-; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %lid
@@ -321,10 +321,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_2
; GFX1032-NEXT: ; %bb.1: ; %if
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX1032-NEXT: global_store_dword v0, v0, s[2:3]
; GFX1032-NEXT: .LBB9_2: ; %endif
; GFX1032-NEXT: s_endpgm
;
@@ -334,10 +334,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_2
; GFX1064-NEXT: ; %bb.1: ; %if
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX1064-NEXT: global_store_dword v0, v0, s[2:3]
; GFX1064-NEXT: .LBB9_2: ; %endif
; GFX1064-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -355,9 +355,9 @@ endif:
define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-LABEL: test_loop_with_if:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1032-NEXT: s_branch .LBB10_2
; GFX1032-NEXT: .LBB10_1: ; %bb13
@@ -366,25 +366,25 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfe, v4
; GFX1032-NEXT: v_add_nc_u32_e32 v1, 1, v4
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execz .LBB10_8
; GFX1032-NEXT: .LBB10_2: ; %bb2
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0
+; GFX1032-NEXT: s_mov_b32 s1, 0
; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_4
; GFX1032-NEXT: ; %bb.3: ; %bb5
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
; GFX1032-NEXT: v_lshlrev_b64 v[2:3], 2, v[1:2]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2
-; GFX1032-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX1032-NEXT: v_add_co_u32 v2, vcc_lo, s2, v2
+; GFX1032-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
; GFX1032-NEXT: global_load_dword v4, v[2:3], off
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v4
@@ -399,13 +399,13 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: ; %bb.5: ; %bb11
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT: v_lshrrev_b32_e32 v4, 31, v1
-; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo
+; GFX1032-NEXT: s_andn2_b32 s1, s1, exec_lo
; GFX1032-NEXT: v_add_nc_u32_e32 v4, v1, v4
; GFX1032-NEXT: v_ashrrev_i32_e32 v4, 1, v4
; GFX1032-NEXT: ; %bb.6: ; %Flow1
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: s_and_saveexec_b32 s4, s3
+; GFX1032-NEXT: s_and_saveexec_b32 s4, s1
; GFX1032-NEXT: s_cbranch_execz .LBB10_1
; GFX1032-NEXT: ; %bb.7: ; %bb10
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
@@ -417,9 +417,9 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
;
; GFX1064-LABEL: test_loop_with_if:
; GFX1064: ; %bb.0: ; %bb
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1064-NEXT: s_branch .LBB10_2
; GFX1064-NEXT: .LBB10_1: ; %bb13
@@ -428,8 +428,8 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0xfe, v4
; GFX1064-NEXT: v_add_nc_u32_e32 v1, 1, v4
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB10_8
; GFX1064-NEXT: .LBB10_2: ; %bb2
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -445,8 +445,8 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: v_lshlrev_b64 v[2:3], 2, v[1:2]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_add_co_u32 v2, vcc, s0, v2
-; GFX1064-NEXT: v_add_co_ci_u32_e32 v3, vcc, s1, v3, vcc
+; GFX1064-NEXT: v_add_co_u32 v2, vcc, s2, v2
+; GFX1064-NEXT: v_add_co_ci_u32_e32 v3, vcc, s3, v3, vcc
; GFX1064-NEXT: global_load_dword v4, v[2:3], off
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 11, v4
@@ -516,43 +516,43 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #
; GFX1032-LABEL: test_loop_with_if_else_break:
; GFX1032: ; %bb.0: ; %bb
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_6
; GFX1032-NEXT: ; %bb.1: ; %.preheader
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_mov_b32 s3, 0
-; GFX1032-NEXT: ; implicit-def: $sgpr4
+; GFX1032-NEXT: s_mov_b32 s0, 0
+; GFX1032-NEXT: ; implicit-def: $sgpr1
; GFX1032-NEXT: s_branch .LBB11_4
; GFX1032-NEXT: .LBB11_2: ; %bb8
; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1
-; GFX1032-NEXT: s_add_i32 s3, s3, 1
-; GFX1032-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s3, v1
-; GFX1032-NEXT: s_add_u32 s0, s0, 4
-; GFX1032-NEXT: s_addc_u32 s1, s1, 0
-; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo
+; GFX1032-NEXT: s_add_i32 s0, s0, 1
+; GFX1032-NEXT: global_store_dword v2, v0, s[2:3]
+; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v1
+; GFX1032-NEXT: s_add_u32 s2, s2, 4
+; GFX1032-NEXT: s_addc_u32 s3, s3, 0
+; GFX1032-NEXT: s_andn2_b32 s1, s1, exec_lo
; GFX1032-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX1032-NEXT: s_or_b32 s4, s4, s5
+; GFX1032-NEXT: s_or_b32 s1, s1, s5
; GFX1032-NEXT: .LBB11_3: ; %Flow
; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1
-; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4
-; GFX1032-NEXT: s_or_b32 s2, s5, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_and_b32 s5, exec_lo, s1
+; GFX1032-NEXT: s_or_b32 s4, s5, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execz .LBB11_6
; GFX1032-NEXT: .LBB11_4: ; %bb2
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v3, v2, s[0:1]
-; GFX1032-NEXT: s_or_b32 s4, s4, exec_lo
+; GFX1032-NEXT: global_load_dword v3, v2, s[2:3]
+; GFX1032-NEXT: s_or_b32 s1, s1, exec_lo
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v3
; GFX1032-NEXT: s_cbranch_vccz .LBB11_2
; GFX1032-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1
-; GFX1032-NEXT: ; implicit-def: $sgpr3
-; GFX1032-NEXT: ; implicit-def: $sgpr0_sgpr1
+; GFX1032-NEXT: ; implicit-def: $sgpr0
+; GFX1032-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX1032-NEXT: s_branch .LBB11_3
; GFX1032-NEXT: .LBB11_6: ; %.loopexit
; GFX1032-NEXT: s_endpgm
@@ -564,39 +564,39 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB11_6
; GFX1064-NEXT: ; %bb.1: ; %.preheader
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_min_u32_e32 v1, 0x100, v0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX1064-NEXT: s_branch .LBB11_4
; GFX1064-NEXT: .LBB11_2: ; %bb8
; GFX1064-NEXT: ; in Loop: Header=BB11_4 Depth=1
; GFX1064-NEXT: s_add_i32 s6, s6, 1
-; GFX1064-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX1064-NEXT: global_store_dword v2, v0, s[2:3]
; GFX1064-NEXT: v_cmp_ge_u32_e32 vcc, s6, v1
-; GFX1064-NEXT: s_add_u32 s0, s0, 4
-; GFX1064-NEXT: s_addc_u32 s1, s1, 0
+; GFX1064-NEXT: s_add_u32 s2, s2, 4
+; GFX1064-NEXT: s_addc_u32 s3, s3, 0
; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
; GFX1064-NEXT: s_and_b64 s[8:9], vcc, exec
; GFX1064-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GFX1064-NEXT: .LBB11_3: ; %Flow
; GFX1064-NEXT: ; in Loop: Header=BB11_4 Depth=1
; GFX1064-NEXT: s_and_b64 s[8:9], exec, s[4:5]
-; GFX1064-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB11_6
; GFX1064-NEXT: .LBB11_4: ; %bb2
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX1064-NEXT: global_load_dword v3, v2, s[2:3]
; GFX1064-NEXT: s_or_b64 s[4:5], s[4:5], exec
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 11, v3
; GFX1064-NEXT: s_cbranch_vccz .LBB11_2
; GFX1064-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1
; GFX1064-NEXT: ; implicit-def: $sgpr6
-; GFX1064-NEXT: ; implicit-def: $sgpr0_sgpr1
+; GFX1064-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX1064-NEXT: s_branch .LBB11_3
; GFX1064-NEXT: .LBB11_6: ; %.loopexit
; GFX1064-NEXT: s_endpgm
@@ -631,26 +631,26 @@ bb8:
define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
; GFX1032-LABEL: test_addc_vop2b:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2
-; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, v0, s6
+; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_addc_vop2b:
; GFX1064: ; %bb.0: ; %bb
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_co_u32 v0, vcc, v0, s2
-; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: v_add_co_u32 v0, vcc, v0, s6
+; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s7, v1, vcc
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -664,26 +664,26 @@ bb:
define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
; GFX1032-LABEL: test_subbrev_vop2b:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2
-; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s6
+; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_subbrev_vop2b:
; GFX1064: ; %bb.0: ; %bb
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, v0, s2
-; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, v0, s6
+; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v1, vcc, s7, v1, vcc
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -697,26 +697,26 @@ bb:
define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
; GFX1032-LABEL: test_subb_vop2b:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s6, v0
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_subb_vop2b:
; GFX1064: ; %bb.0: ; %bb
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s6, v0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s7, v1, vcc
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1063,30 +1063,30 @@ bb:
define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX1032-LABEL: test_div_scale_f32:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX1032-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX1032-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_div_scale_f32 v1, s2, v2, v2, v1
-; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT: v_div_scale_f32 v1, s0, v2, v2, v1
+; GFX1032-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_div_scale_f32:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX1064-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX1064-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_div_scale_f32 v1, s[2:3], v2, v2, v1
-; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], v2, v2, v1
+; GFX1064-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1064-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -1106,30 +1106,32 @@ define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspa
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_div_scale_f64:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1064-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -1451,11 +1453,11 @@ define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrs
; GCN-NEXT: s_bitcmp0_b32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB25_2
; GCN-NEXT: ; %bb.1: ; %store
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0xde
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: global_store_dword v0, v1, s[2:3]
; GCN-NEXT: .LBB25_2: ; %end
; GCN-NEXT: s_endpgm
%cmp0 = icmp ne i1 %val, 0
@@ -1634,7 +1636,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1)
; GFX1032-LABEL: test_movrels_extract_neg_offset_vgpr:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
@@ -1643,13 +1645,13 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1)
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 3, v0
; GFX1032-NEXT: v_cndmask_b32_e32 v0, 3, v1, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX1032-NEXT: global_store_dword v2, v0, s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_movrels_extract_neg_offset_vgpr:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1658,7 +1660,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1)
; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
; GFX1064-NEXT: v_cndmask_b32_e32 v0, 3, v1, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX1064-NEXT: global_store_dword v2, v0, s[2:3]
; GFX1064-NEXT: s_endpgm
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -1704,30 +1706,30 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0
define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 {
; GFX1032-LABEL: test_set_inactive_64:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v0, s6
+; GFX1032-NEXT: v_mov_b32_e32 v1, s7
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_set_inactive_64:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v1, s7
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
store i64 %tmp, ptr addrspace(1) %out
@@ -2354,42 +2356,42 @@ define amdgpu_ps float @test_ps_live() #0 {
define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX1032-LABEL: test_vccnz_ifcvt_triangle64:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_cmp_neq_f64_e64 s4, s[2:3], 1.0
-; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s4
+; GFX1032-NEXT: v_cmp_neq_f64_e64 s2, s[0:1], 1.0
+; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX1032-NEXT: s_cbranch_vccnz .LBB47_2
; GFX1032-NEXT: ; %bb.1: ; %if
-; GFX1032-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3]
+; GFX1032-NEXT: v_add_f64 v[0:1], s[0:1], s[0:1]
; GFX1032-NEXT: s_branch .LBB47_3
; GFX1032-NEXT: .LBB47_2:
-; GFX1032-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032-NEXT: v_mov_b32_e32 v1, s1
; GFX1032-NEXT: .LBB47_3: ; %endif
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_vccnz_ifcvt_triangle64:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_cmp_neq_f64_e64 s[4:5], s[2:3], 1.0
-; GFX1064-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX1064-NEXT: v_cmp_neq_f64_e64 s[2:3], s[0:1], 1.0
+; GFX1064-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_vccnz .LBB47_2
; GFX1064-NEXT: ; %bb.1: ; %if
-; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3]
+; GFX1064-NEXT: v_add_f64 v[0:1], s[0:1], s[0:1]
; GFX1064-NEXT: s_branch .LBB47_3
; GFX1064-NEXT: .LBB47_2:
-; GFX1064-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s1
; GFX1064-NEXT: .LBB47_3: ; %endif
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
entry:
%v = load double, ptr addrspace(1) %in
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index e0b320a..025b856 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -22,11 +22,11 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_i16_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 4
@@ -36,10 +36,10 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_i16_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s0, 0x3e7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -76,11 +76,11 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar
;
; VI-LABEL: widen_i16_constant_load_zext_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
@@ -91,10 +91,10 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar
;
; GFX11-LABEL: widen_i16_constant_load_zext_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -134,11 +134,11 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar
;
; VI-LABEL: widen_i16_constant_load_sext_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s0, s0
; VI-NEXT: s_addk_i32 s0, 0x3e7
@@ -149,10 +149,10 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar
;
; GFX11-LABEL: widen_i16_constant_load_sext_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sext_i32_i16 s0, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -199,13 +199,13 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_i17_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, 2
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_i32 s0, s0, 34
; VI-NEXT: s_or_b32 s0, s0, 4
@@ -218,10 +218,10 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_i17_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s0, s0, 34
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -263,11 +263,11 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_f16_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f16_e64 v2, s0, 4.0
; VI-NEXT: flat_store_short v[0:1], v2
@@ -275,11 +275,11 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_f16_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f16_e64 v2, s0, 4.0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
@@ -317,11 +317,11 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_v2i8_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 44
; VI-NEXT: v_mov_b32_e32 v1, 3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -338,9 +338,9 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_v2i8_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_nc_u16 v0, s0, 12
; GFX11-NEXT: v_and_b32_e64 v1, 0xffffff00, s0
@@ -387,11 +387,11 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
;
; VI-LABEL: no_widen_i16_constant_divergent_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -404,10 +404,10 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
;
; GFX11-LABEL: no_widen_i16_constant_divergent_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x3e7
; GFX11-NEXT: v_mov_b32_e32 v0, 0
@@ -446,11 +446,11 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_i1_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -459,10 +459,10 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_i1_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -497,11 +497,11 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4)
;
; VI-LABEL: widen_i16_zextload_i64_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
@@ -512,10 +512,10 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4)
;
; GFX11-LABEL: widen_i16_zextload_i64_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -556,11 +556,11 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %
;
; VI-LABEL: widen_i1_zext_to_i64_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 1
; VI-NEXT: s_add_u32 s0, s0, 0x3e7
@@ -572,9 +572,9 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %
;
; GFX11-LABEL: widen_i1_zext_to_i64_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
@@ -667,11 +667,11 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg
;
; VI-LABEL: widen_i16_global_invariant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 1
@@ -681,10 +681,10 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg
;
; GFX11-LABEL: widen_i16_global_invariant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s0, 0x3e7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 5422bfa..54240ad 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -29,12 +29,12 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-LABEL: xor_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
@@ -80,12 +80,12 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-LABEL: xor_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v8, s4
@@ -134,12 +134,12 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0,
; VI-LABEL: xor_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -190,12 +190,12 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0
; VI-LABEL: v_xor_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_ubyte v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_ubyte v2, v[2:3] glc
@@ -239,12 +239,12 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-LABEL: vector_xor_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -304,13 +304,13 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
;
; VI-LABEL: scalar_not_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_not_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_not_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%result = xor i32 %a, -1
@@ -339,13 +339,13 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: vector_not_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_not_b32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -384,12 +384,12 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1
; VI-LABEL: vector_xor_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
@@ -425,10 +425,10 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; VI-LABEL: scalar_xor_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1]
+; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -456,12 +456,12 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: scalar_not_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_not_b64 s[0:1], s[2:3]
+; VI-NEXT: s_not_b64 s[0:1], s[6:7]
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -492,13 +492,13 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: vector_not_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_not_b32_e32 v0, v0
; VI-NEXT: v_not_b32_e32 v1, v1
@@ -545,25 +545,25 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i
;
; VI-LABEL: xor_cf:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
+; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
; VI-NEXT: s_cbranch_scc0 .LBB12_4
; VI-NEXT: ; %bb.1: ; %else
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; VI-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; VI-NEXT: s_cbranch_vccnz .LBB12_3
; VI-NEXT: .LBB12_2: ; %if
-; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7]
+; VI-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: .LBB12_3: ; %endif
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -606,14 +606,14 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: scalar_xor_literal_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s3, s3, 0xf237b
-; VI-NEXT: s_xor_b32 s2, s2, 0x3039
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_xor_b32 s0, s3, 0xf237b
+; VI-NEXT: s_xor_b32 s1, s2, 0x3039
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%or = xor i64 %a, 4261135838621753
@@ -647,15 +647,15 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou
; VI-LABEL: scalar_xor_literal_multi_use_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_movk_i32 s2, 0x3039
-; VI-NEXT: s_mov_b32 s3, 0xf237b
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s0, 0x3039
+; VI-NEXT: s_mov_b32 s1, 0xf237b
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_add_u32 s0, s6, 0x3039
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_addc_u32 s1, s7, 0xf237b
@@ -689,13 +689,13 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x
; VI-LABEL: scalar_xor_inline_imm_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s2, 63
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_xor_b32 s0, s2, 63
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%or = xor i64 %a, 63
@@ -720,13 +720,13 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out,
; VI-LABEL: scalar_xor_neg_inline_imm_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_xor_b64 s[0:1], s[2:3], -8
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
%or = xor i64 %a, -8
@@ -756,13 +756,13 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out,
;
; VI-LABEL: vector_xor_i64_neg_inline_imm:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_xor_b32_e32 v0, -8, v0
; VI-NEXT: v_xor_b32_e32 v1, -1, v1
@@ -796,13 +796,13 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: vector_xor_literal_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_xor_b32_e32 v1, 0x146f, v1
; VI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index f9137b0..af50e09 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -53,7 +53,7 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i
; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}}
; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}}
-; GCN: s_cmp_eq_u32 [[MASK_A]], [[B]]
+; GCN: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]]
; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN: buffer_store_short [[RESULT]]